{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,23]],"date-time":"2025-09-23T00:36:36Z","timestamp":1758587796590,"version":"3.44.0"},"publisher-location":"Cham","reference-count":41,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032060037","type":"print"},{"value":"9783032060044","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,9,22]],"date-time":"2025-09-22T00:00:00Z","timestamp":1758499200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,22]],"date-time":"2025-09-22T00:00:00Z","timestamp":1758499200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-06004-4_22","type":"book-chapter","created":{"date-parts":[[2025,9,22]],"date-time":"2025-09-22T17:21:34Z","timestamp":1758561694000},"page":"215-225","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["MoViS: Motion-guided Video Generation for\u00a0Laparoscopic Surgery"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3114-6729","authenticated-orcid":false,"given":"Yousef","family":"Yeganeh","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6032-5611","authenticated-orcid":false,"given":"Nassir","family":"Navab","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1080-1587","authenticated-orcid":false,"given":"Azade","family":"Farshad","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,9,22]]},"reference":[{"key":"22_CR1","unstructured":"Blattmann, A., et al.: Stable video diffusion: scaling latent video diffusion models to large datasets (2023)"},{"key":"22_CR2","doi-asserted-by":"crossref","unstructured":"Blattmann, A., et al.: Align your latents: high-resolution video synthesis with latent diffusion models. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"22_CR3","unstructured":"Bodenstedt, S., et\u00a0al.: Comparative evaluation of instrument segmentation and tracking methods in minimally invasive surgery. arXiv (2018)"},{"key":"22_CR4","doi-asserted-by":"publisher","unstructured":"Budd, C., Vercauteren, T.: Transferring relative monocular depth to surgical vision with temporal consistency. In: International Conference on Medical Image Computing and Computer-Assisted Intervention, pp. 692\u2013702. Springer, Heidelberg (2024). https:\/\/doi.org\/10.1007\/978-3-031-72089-5_65","DOI":"10.1007\/978-3-031-72089-5_65"},{"key":"22_CR5","unstructured":"Cho, J., et al.: Surgen: text-guided diffusion model for surgical video generation. arXiv (2024)"},{"key":"22_CR6","doi-asserted-by":"crossref","unstructured":"Dhamo, H., et al.: Semantic image manipulation using scene graphs. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5213\u20135222 (2020)","DOI":"10.1109\/CVPR42600.2020.00526"},{"key":"22_CR7","first-page":"8780","volume":"34","author":"P Dhariwal","year":"2021","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat gans on image synthesis. Adv. Neural. Inf. Process. Syst. 34, 8780\u20138794 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"22_CR8","doi-asserted-by":"crossref","unstructured":"Farshad, A., Yeganeh, Y., Chi, Y., Shen, C., Ommer, B., Navab, N.: Scenegenie: scene graph guided diffusion models for image synthesis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 88\u201398 (2023)","DOI":"10.1109\/ICCVW60793.2023.00016"},{"key":"22_CR9","unstructured":"Farshad, A., Yeganeh, Y., Dhamo, H., Tombari, F., Navab, N.: Dispositionet: disentangled pose and identity in semantic image manipulation. In: 33rd British Machine Vision Conference 2022, BMVC 2022, London, UK, 21\u201324 November 2022. BMVA Press (2022)"},{"key":"22_CR10","doi-asserted-by":"crossref","unstructured":"Hedlin, E., et al.: Unsupervised keypoints from pretrained diffusion models. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.02153"},{"key":"22_CR11","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"22_CR12","unstructured":"Ho, J., Salimans, T., Gritsenko, A., Chan, W., Norouzi, M., Fleet, D.J.: Video diffusion models. In: NeurIPS (2022)"},{"key":"22_CR13","doi-asserted-by":"crossref","unstructured":"Hu, Y., Luo, C., Chen, Z.: Make it move: controllable image-to-video generation with text descriptions. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01768"},{"key":"22_CR14","doi-asserted-by":"publisher","unstructured":"Iliash, I., Allmendinger, S., Meissen, F., K\u00fchl, N., R\u00fcckert, D.: Interactive generation of laparoscopic videos with diffusion models. In: MICCAI Workshop on Deep Generative Models, pp. 109\u2013118. Springer, Heidelberg (2024). https:\/\/doi.org\/10.1007\/978-3-031-72744-3_11","DOI":"10.1007\/978-3-031-72744-3_11"},{"key":"22_CR15","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"214","DOI":"10.1007\/978-3-030-00937-3_25","volume-title":"Medical Image Computing and Computer Assisted Intervention \u2013 MICCAI 2018","author":"H Ismail Fawaz","year":"2018","unstructured":"Ismail Fawaz, H., Forestier, G., Weber, J., Idoumghar, L., Muller, P.-A.: Evaluating surgical skills from kinematic data using convolutional neural networks. In: Frangi, A.F., Schnabel, J.A., Davatzikos, C., Alberola-L\u00f3pez, C., Fichtinger, G. (eds.) MICCAI 2018. LNCS, vol. 11073, pp. 214\u2013221. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-00937-3_25"},{"key":"22_CR16","doi-asserted-by":"crossref","unstructured":"Jin, A., et al.: Tool detection and operative skill assessment in surgical videos using region-based convolutional neural networks. In: 2018 IEEE Winter Conference on Applications of Computer Vision (WACV), pp. 691\u2013699. IEEE (2018)","DOI":"10.1109\/WACV.2018.00081"},{"key":"22_CR17","doi-asserted-by":"crossref","unstructured":"Khachatryan, L., et al.: Text2video-zero: text-to-image diffusion models are zero-shot video generators. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15954\u201315964 (2023)","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"22_CR18","doi-asserted-by":"publisher","unstructured":"K\u00f6ksal, \u00c7., Ghazaei, G., Holm, F., Farshad, A., Navab, N.: Sangria: surgical video scene graph optimization for surgical workflow prediction. In: International Workshop on Graphs in Biomedical Image Analysis, pp. 106\u2013117. Springer, Heidelberg (2024). https:\/\/doi.org\/10.1007\/978-3-031-83243-7_10","DOI":"10.1007\/978-3-031-83243-7_10"},{"key":"22_CR19","doi-asserted-by":"publisher","unstructured":"Li, C., et al.: Endora: video generation models as endoscopy simulators. In: International Conference on Medical Image Computing and Computer-Assisted Intervention, pp. 230\u2013240. Springer, Heidelberg (2024). https:\/\/doi.org\/10.1007\/978-3-031-72089-5_22","DOI":"10.1007\/978-3-031-72089-5_22"},{"key":"22_CR20","doi-asserted-by":"crossref","unstructured":"Lin, W., et al.: Instrument-tissue interaction detection framework for surgical video understanding. IEEE Trans. Med. Imaging (2024)","DOI":"10.1109\/TMI.2024.3381209"},{"key":"22_CR21","doi-asserted-by":"crossref","unstructured":"Lu, J., Jayakumari, A., Richter, F., Li, Y., Yip, M.C.: Super deep: a surgical perception framework for robotic tissue manipulation using deep learning for feature extraction. In: 2021 IEEE International Conference on Robotics and Automation (ICRA), pp. 4783\u20134789. IEEE (2021)","DOI":"10.1109\/ICRA48506.2021.9561249"},{"key":"22_CR22","doi-asserted-by":"publisher","unstructured":"L\u00fcpke, S., Yeganeh, Y., Adeli, E., Navab, N., Farshad, A.: Physics-informed latent diffusion for multimodal brain mri synthesis. In: International Conference on Medical Image Computing and Computer-Assisted Intervention, pp. 198\u2013207. Springer, Heidelberg (2024). https:\/\/doi.org\/10.1007\/978-3-031-84525-3_17","DOI":"10.1007\/978-3-031-84525-3_17"},{"key":"22_CR23","doi-asserted-by":"publisher","first-page":"230","DOI":"10.1016\/j.media.2018.06.005","volume":"48","author":"F Mahmood","year":"2018","unstructured":"Mahmood, F., Durr, N.J.: Deep learning and conditional random fields-based depth estimation and topographical reconstruction from conventional endoscopy. Med. Image Anal. 48, 230\u2013243 (2018)","journal-title":"Med. Image Anal."},{"key":"22_CR24","doi-asserted-by":"crossref","unstructured":"Maier-Hein, L., et\u00a0al.: Surgical data science for next-generation interventions. Nat. Biomed. Eng. 1 (2017)","DOI":"10.1038\/s41551-017-0132-7"},{"key":"22_CR25","unstructured":"Moing, G.L., Ponce, J., Schmid, C.: WALDO: future video synthesis using object layer decomposition and parametric flow prediction. In: ICCV (2023)"},{"key":"22_CR26","unstructured":"Mostafa, M.L., et al.: Surgical flow masked autoencoder for event recognition. In: Medical Imaging with Deep Learning (2025)"},{"key":"22_CR27","doi-asserted-by":"crossref","unstructured":"Ni, H., Shi, C., Li, K., Huang, S.X., Min, M.R.: Conditional image-to-video generation with latent flow diffusion models. In: CVPR, pp. 18444\u201318455 (2023)","DOI":"10.1109\/CVPR52729.2023.01769"},{"key":"22_CR28","doi-asserted-by":"crossref","unstructured":"Nwoye, C.I., et al.: Rendezvous: attention mechanisms for the recognition of surgical action triplets in endoscopic videos. Med. Image Anal. (2022)","DOI":"10.1016\/j.media.2022.102433"},{"key":"22_CR29","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2023.102888","volume":"89","author":"CI Nwoye","year":"2023","unstructured":"Nwoye, C.I., et al.: Cholectriplet 2022: show me a tool and tell me the triplet\u2013an endoscopic vision challenge for surgical action triplet detection. Med. Image Anal. 89, 102888 (2023)","journal-title":"Med. Image Anal."},{"key":"22_CR30","doi-asserted-by":"crossref","unstructured":"Rivoir, D., et al.: Long-term temporally consistent unpaired video translation from simulated surgical 3d data. In: ICCV, pp. 3343\u20133353 (2021)","DOI":"10.1109\/ICCV48922.2021.00333"},{"key":"22_CR31","unstructured":"Singer, U., et\u00a0al.: Make-a-video: text-to-video generation without text-video data. arXiv (2022)"},{"key":"22_CR32","unstructured":"Song, Y., Sohl-Dickstein, J., Kingma, D.P., Kumar, A., Ermon, S., Poole, B.: Score-based generative modeling through stochastic differential equations. arXiv (2020)"},{"key":"22_CR33","unstructured":"Tang, Z., Yang, Z., Zhu, C., Zeng, M., Bansal, M.: Any-to-any generation via composable diffusion. In: NeurIPS (2023)"},{"issue":"1","key":"22_CR34","doi-asserted-by":"publisher","first-page":"86","DOI":"10.1109\/TMI.2016.2593957","volume":"36","author":"AP Twinanda","year":"2016","unstructured":"Twinanda, A.P., Shehata, S., Mutter, D., Marescaux, J., De Mathelin, M., Padoy, N.: Endonet: a deep architecture for recognition tasks on laparoscopic videos. IEEE Trans. Med. Imaging 36(1), 86\u201397 (2016)","journal-title":"IEEE Trans. Med. Imaging"},{"key":"22_CR35","unstructured":"Vondrick, C., Pirsiavash, H., Torralba, A.: Generating videos with scene dynamics. Adv. Neural Inf. Process. Syst. 29 (2016)"},{"key":"22_CR36","doi-asserted-by":"crossref","unstructured":"Wang, Z., et al.: Motionctrl: a unified and flexible motion controller for video generation. In: ACM SIGGRAPH 2024 Conference Papers, pp. 1\u201311 (2024)","DOI":"10.1145\/3641519.3657518"},{"key":"22_CR37","doi-asserted-by":"crossref","unstructured":"Yang, L., Kang, B., Huang, Z., Xu, X., Feng, J., Zhao, H.: Depth anything: unleashing the power of large-scale unlabeled data. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.00987"},{"key":"22_CR38","doi-asserted-by":"crossref","unstructured":"Yeganeh, Y., et al.: Latent drifting in diffusion models for counterfactual medical image synthesis. In: Proceedings of the Computer Vision and Pattern Recognition Conference, pp. 7685\u20137695 (2025)","DOI":"10.1109\/CVPR52734.2025.00720"},{"key":"22_CR39","doi-asserted-by":"crossref","unstructured":"Yeganeh, Y., et al.: Visage: video synthesis using action graphs for surgery. In: International Conference on Medical Image Computing and Computer-Assisted Intervention, pp. 146\u2013156. Springer, Heidelberg (2024)","DOI":"10.1007\/978-3-031-77610-6_14"},{"key":"22_CR40","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: ICCV, pp. 3836\u20133847 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"22_CR41","doi-asserted-by":"crossref","unstructured":"Zhao, Z., Fang, F., Yang, X., Xu, Q., Guan, C., Zhou, S.K.: See, predict, plan: diffusion for procedure planning in robotic surgical videos. In: International Conference on Medical Image Computing and Computer-Assisted Intervention, pp. 553\u2013563. Springer, Heidelberg (2024)","DOI":"10.1007\/978-3-031-72089-5_52"}],"container-title":["Lecture Notes in Computer Science","AI for Clinical Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-06004-4_22","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,22]],"date-time":"2025-09-22T17:21:48Z","timestamp":1758561708000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-06004-4_22"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,22]]},"ISBN":["9783032060037","9783032060044"],"references-count":41,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-06004-4_22","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9,22]]},"assertion":[{"value":"22 September 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"CREATE","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Workshop on Clinical-Driven Robotics and Embodied AI Technology","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Daejeon","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Korea (Republic of)","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"create2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/sites.google.com\/view\/create-2025\/home","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}