{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,11]],"date-time":"2025-10-11T15:13:16Z","timestamp":1760195596212,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":14,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032078445","type":"print"},{"value":"9783032078452","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T00:00:00Z","timestamp":1760227200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T00:00:00Z","timestamp":1760227200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-07845-2_15","type":"book-chapter","created":{"date-parts":[[2025,10,11]],"date-time":"2025-10-11T14:28:20Z","timestamp":1760192900000},"page":"150-158","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Temporally-Constrained Video Reasoning Segmentation and\u00a0Automated Benchmark Construction"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7866-3339","authenticated-orcid":false,"given":"Yiqing","family":"Shen","sequence":"first","affiliation":[]},{"given":"Chenjia","family":"Li","sequence":"additional","affiliation":[]},{"given":"Chenxiao","family":"Fan","sequence":"additional","affiliation":[]},{"given":"Mathias","family":"Unberath","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,10,12]]},"reference":[{"key":"15_CR1","unstructured":"Bai, S., et\u00a0al.: Qwen2. 5-VL technical report. arXiv preprint arXiv:2502.13923 (2025)"},{"key":"15_CR2","doi-asserted-by":"crossref","unstructured":"Demir, K.C., Rodriguez, B.O., Weise, T., Maier, A., Yang, S.H.: Towards intelligent speech assistants in operating rooms: a multimodal model for surgical workflow analysis. arXiv preprint arXiv:2406.14576 (2024)","DOI":"10.21437\/Interspeech.2024-975"},{"issue":"2","key":"15_CR3","doi-asserted-by":"publisher","first-page":"375","DOI":"10.1007\/s11548-023-02971-6","volume":"19","author":"M Grammatikopoulou","year":"2024","unstructured":"Grammatikopoulou, M., et al.: A spatio-temporal network for video semantic segmentation in surgical videos. Int. J. Comput. Assist. Radiol. Surg. 19(2), 375\u2013382 (2024)","journal-title":"Int. J. Comput. Assist. Radiol. Surg."},{"issue":"7","key":"15_CR4","doi-asserted-by":"publisher","first-page":"1911","DOI":"10.1109\/TMI.2021.3069471","volume":"40","author":"Y Jin","year":"2021","unstructured":"Jin, Y., Long, Y., Chen, C., Zhao, Z., Dou, Q., Heng, P.-A.: Temporal memory relation network for workflow recognition from surgical video. IEEE Trans. Med. Imaging 40(7), 1911\u20131923 (2021)","journal-title":"IEEE Trans. Med. Imaging"},{"key":"15_CR5","doi-asserted-by":"crossref","unstructured":"Lai, X., Tian, Z., Chen, Y., et\u00a0al.: LISA: reasoning segmentation via large language model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9579\u20139589 (2024)","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"15_CR6","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 26296\u201326306 (2024)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"15_CR7","unstructured":"Ravi, N., Gabeur, V., Hu, Y.-T., et\u00a0al.: SAM 2: segment anything in images and videos. arXiv preprint arXiv:2408.00714 (2024)"},{"key":"15_CR8","unstructured":"Shen, Y., Ding, H., Seenivasan, L., Shu, T., Unberath, M.: Position: foundation models need digital twin representations. arXiv preprint arXiv:2505.03798 (2025)"},{"key":"15_CR9","unstructured":"Shen, Y., Li, C., Fan, C., Unberath, M.: RVTBench: a benchmark for visual reasoning tasks. arXiv preprint arXiv:2505.11838 (2025)"},{"key":"15_CR10","doi-asserted-by":"crossref","unstructured":"Shen, Y., Li, C., Liu, B., Li, C.-Y., Porras, T., Unberath, M.: Operating room workflow analysis via reasoning segmentation over digital twins. arXiv preprint arXiv:2503.21054 (2025)","DOI":"10.1007\/978-3-032-05114-1_40"},{"key":"15_CR11","unstructured":"Shen, Y., et al.: Reasoning segmentation for images and videos: a survey. arXiv preprint arXiv:2505.18816 (2025)"},{"key":"15_CR12","unstructured":"Srivastav, V., Issenhuth, T., Kadkhodamohammadi, A., de\u00a0Mathelin, M., Gangi, A., Padoy, N.: MVOR: a multi-view RGB-D operating room dataset for 2D and 3D human pose estimation. arXiv preprint arXiv:1808.08180 (2018)"},{"key":"15_CR13","doi-asserted-by":"crossref","unstructured":"Xu, J., Sirajudeen, N., Boal, M., Francis, N., Stoyanov, D., Mazomenos, E.B.: SedMamba: enhancing selective state space modelling with bottleneck mechanism and fine-to-coarse temporal fusion for efficient error detection in robot-assisted surgery. IEEE Robot. Autom. Lett. (2024)","DOI":"10.1109\/LRA.2024.3505818"},{"key":"15_CR14","unstructured":"Yang, L., Kang, B., Huang, Z., et\u00a0al.: Depth anything v2. arXiv preprint arXiv:2406.09414 (2024)"}],"container-title":["Lecture Notes in Computer Science","Foundation Models for General Medical AI"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-07845-2_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,11]],"date-time":"2025-10-11T14:28:22Z","timestamp":1760192902000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-07845-2_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"ISBN":["9783032078445","9783032078452"],"references-count":14,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-07845-2_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,10,12]]},"assertion":[{"value":"12 October 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"MedAGI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Workshop on Foundation Models for General Medical AI","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Daejeon","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Korea (Republic of)","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"3","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"medagi2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/medagi2025.github.io\/#\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}