{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,30]],"date-time":"2026-06-30T15:52:27Z","timestamp":1782834747607,"version":"3.54.5"},"publisher-location":"Cham","reference-count":25,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031820236","type":"print"},{"value":"9783031820243","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-82024-3_12","type":"book-chapter","created":{"date-parts":[[2025,2,24]],"date-time":"2025-02-24T14:10:08Z","timestamp":1740406208000},"page":"151-165","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["TransSG: A Spatial-Temporal Transformer for\u00a0Surgical Gesture Recognition"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4952-9207","authenticated-orcid":false,"given":"Le","family":"Ma","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5515-6841","authenticated-orcid":false,"given":"Hangyeol","family":"Kang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1459-5960","authenticated-orcid":false,"given":"Nadia","family":"Magnenat-Thalmann","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8060-399X","authenticated-orcid":false,"given":"Katarzyna","family":"Wac","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2025,2,25]]},"reference":[{"key":"12_CR1","doi-asserted-by":"crossref","unstructured":"van Amsterdam, B., Matthew, J., Clarkson, D.S.: Multi-task recurrent neural network for surgical gesture recognition and progress prediction. In: Proceedings of the 2020 IEEE International Conference on Robotics and Automation (ICRA) (2020)","DOI":"10.1109\/ICRA40945.2020.9197301"},{"key":"12_CR2","doi-asserted-by":"crossref","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lucic, M., Schmid, C.: ViViT: a video vision transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00676"},{"issue":"11","key":"12_CR3","doi-asserted-by":"publisher","first-page":"3739","DOI":"10.1109\/TPAMI.2020.2993627","volume":"43","author":"Y Cai","year":"2021","unstructured":"Cai, Y., Ge, L., Cai, J., Thalmann, N.M., Yuan, J.: 3D hand pose estimation using synthetic data and weakly labeled RGB images. IEEE Trans. Pattern Anal. Mach. Intell. 43(11), 3739\u20133753 (2021). https:\/\/doi.org\/10.1109\/TPAMI.2020.2993627","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"12_CR4","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (ICLR), arXiv:2010.11929 (2020)"},{"key":"12_CR5","doi-asserted-by":"crossref","unstructured":"Gao, X., Jin, Y., Long, Y., Dou, Q., Heng, P.A.: Trans-SVNet: accurate phase recognition from surgical videos via hybrid embedding aggregation transformer. In: International Conference on Medical Image Computing and Computer-Assisted Intervention, pp. 593\u2013603. Springer (2021)","DOI":"10.1007\/978-3-030-87202-1_57"},{"key":"12_CR6","unstructured":"Gao, Y., et al.: The JHU-ISI gesture and skill assessment working set (JIGSAWS): a surgical activity dataset for human motion modeling. In: Modeling and Monitoring of Computer Assisted Interventions (M2CAI) - MICCAI Workshop (2014)"},{"key":"12_CR7","doi-asserted-by":"crossref","unstructured":"Huang, Z., Li, J., Weng, C., Lee, C.H.: Beyond cross-entropy: towards better frame-level objective functions for deep neural network training in automatic speech recognition, pp. 3246\u20133250 (2014)","DOI":"10.21437\/Interspeech.2014-306"},{"key":"12_CR8","doi-asserted-by":"crossref","unstructured":"Ju, H., et al.: TransFG: a transformer architecture for fine-grained recognition. In: AAAI Conference on Artificial Intelligence, vol. 36, pp. 852\u2013860 (2022)","DOI":"10.1609\/aaai.v36i1.19967"},{"key":"12_CR9","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: International Conference for Learning Representations (ICLR) (2015)"},{"key":"12_CR10","doi-asserted-by":"crossref","unstructured":"Kiyasseh, D., et al.: A vision transformer for decoding surgeon activity from surgical videos. Nat. Biomed. Eng. (2023)","DOI":"10.1038\/s41551-023-01010-8"},{"key":"12_CR11","doi-asserted-by":"publisher","first-page":"7926","DOI":"10.1109\/TIP.2021.3112008","volume":"30","author":"H Li","year":"2021","unstructured":"Li, H., Jiang, X., Guan, B., Tan, R.R.M., Wang, R., Thalmann, N.M.: Joint feature optimization and fusion for compressed action recognition. IEEE Trans. Image Process. 30, 7926\u20137937 (2021). https:\/\/doi.org\/10.1109\/TIP.2021.3112008","journal-title":"IEEE Trans. Image Process."},{"key":"12_CR12","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"12_CR13","doi-asserted-by":"crossref","unstructured":"Luongo, F., Hakim, R., Nguyen, J.H., Anandkumar, A., Hung, A.J.: Deep learning-based computer vision to recognize and classify suturing gestures in robot-assisted surgery. arXiv preprint arXiv:2008.11833 (2020)","DOI":"10.1016\/j.surg.2020.08.016"},{"key":"12_CR14","doi-asserted-by":"crossref","unstructured":"Ma, R., et al.: Surgical gestures as a method to quantify surgical performance and predict patient outcomes. NPJ Digital Med. (2022)","DOI":"10.1038\/s41746-022-00738-y"},{"key":"12_CR15","doi-asserted-by":"publisher","unstructured":"Moutik, O.,et al.: Convolutional neural networks or vision transformers: who will win the race for action recognitions in visual data? Sensors 23(2) (2023). https:\/\/doi.org\/10.3390\/s23020734","DOI":"10.3390\/s23020734"},{"key":"12_CR16","doi-asserted-by":"crossref","unstructured":"Parmar, P., Tran\u00a0Morris, B.: What and how well you performed? A multitask learning approach to action quality assessment. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 304\u2013313 (2019)","DOI":"10.1109\/CVPR.2019.00039"},{"key":"12_CR17","doi-asserted-by":"crossref","unstructured":"Qin, Y., et al.: Temporal segmentation of surgical subtasks through deep learning with multiple data sources. In: 2020 IEEE International Conference on Robotics and Automation (ICRA), pp. 371\u2013377. IEEE (2020)","DOI":"10.1109\/ICRA40945.2020.9196560"},{"key":"12_CR18","doi-asserted-by":"crossref","unstructured":"Robbins, H., Monro, S.: A stochastic approximation method. Ann. Math. Stat., 400\u2013407 (1951)","DOI":"10.1214\/aoms\/1177729586"},{"key":"12_CR19","unstructured":"Ross, T., et\u00a0al.: Robust medical instrument segmentation challenge 2019. arXiv preprint arXiv:2003.10299 (2020)"},{"key":"12_CR20","doi-asserted-by":"crossref","unstructured":"Sak, H., Senior, A., Beaufays, F.: Long short-term memory based recurrent neural network architectures for large vocabulary speech recognition. arXiv preprint arXiv:1402.1128 (2014)","DOI":"10.21437\/Interspeech.2014-80"},{"key":"12_CR21","unstructured":"Touvron, H., Vedaldi, A., Douze, M., J\u00e9gou, H.: Fixing the train-test resolution discrepancy. arXiv preprint arXiv:1906.06423"},{"key":"12_CR22","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems (NeurIPS) (2017)"},{"key":"12_CR23","doi-asserted-by":"crossref","unstructured":"Wang, L., et al.: VideoMAE V2: scaling video masked autoencoders with dual masking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 14549\u201314560 (2023)","DOI":"10.1109\/CVPR52729.2023.01398"},{"key":"12_CR24","doi-asserted-by":"crossref","unstructured":"Zheng, Y., Leonard, G., Zeh, H., Fey, A.M.: Frame-wise detection of surgeon stress levels during laparoscopic training using kinematic data, pp. 1\u201310 (2022)","DOI":"10.1007\/s11548-022-02568-5"},{"issue":"3","key":"12_CR25","doi-asserted-by":"publisher","first-page":"123","DOI":"10.1234\/svformer.2023.00123","volume":"15","author":"X Zhen","year":"2023","unstructured":"Zhen, X., Qi, D., Han, H., Chen, J., Wu, Z., Jiang, Y.G.: SVFormer: a transformer-based model for video super-resolution. J. Video Process. 15(3), 123\u2013135 (2023). https:\/\/doi.org\/10.1234\/svformer.2023.00123","journal-title":"J. Video Process."}],"container-title":["Lecture Notes in Computer Science","Advances in Computer Graphics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-82024-3_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,29]],"date-time":"2025-07-29T04:24:34Z","timestamp":1753763074000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-82024-3_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031820236","9783031820243"],"references-count":25,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-82024-3_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"25 February 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"CGI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Computer Graphics International Conference","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Geneva","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Switzerland","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 July 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 July 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"41","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"cgi2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.cgs-network.org\/cgi24\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}