{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,15]],"date-time":"2026-04-15T23:53:58Z","timestamp":1776297238914,"version":"3.50.1"},"publisher-location":"Cham","reference-count":28,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031720888","type":"print"},{"value":"9783031720895","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-72089-5_67","type":"book-chapter","created":{"date-parts":[[2024,10,2]],"date-time":"2024-10-02T16:02:20Z","timestamp":1727884940000},"page":"714-724","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Tri-Modal Confluence with\u00a0Temporal Dynamics for\u00a0Scene Graph Generation in\u00a0Operating Rooms"],"prefix":"10.1007","author":[{"given":"Diandian","family":"Guo","sequence":"first","affiliation":[]},{"given":"Manxi","family":"Lin","sequence":"additional","affiliation":[]},{"given":"Jialun","family":"Pei","sequence":"additional","affiliation":[]},{"given":"He","family":"Tang","sequence":"additional","affiliation":[]},{"given":"Yueming","family":"Jin","sequence":"additional","affiliation":[]},{"given":"Pheng-Ann","family":"Heng","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,3]]},"reference":[{"key":"67_CR1","unstructured":"Alayrac, J.B., Donahue, J., Luc, P., et\u00a0al, A.M.: Flamingo: a visual language model for few-shot learning. In: NeurIPS (2022)"},{"issue":"1","key":"67_CR2","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TPAMI.2021.3137605","volume":"45","author":"X Chang","year":"2021","unstructured":"Chang, X., Ren, P., Xu, P., Li, Z., Chen, X., Hauptmann, A.: A comprehensive survey of scene graphs: generation and application. IEEE TPAMI 45(1), 1\u201326 (2021)","journal-title":"IEEE TPAMI"},{"issue":"9","key":"67_CR3","doi-asserted-by":"publisher","first-page":"11169","DOI":"10.1109\/TPAMI.2023.3268066","volume":"45","author":"Y Cong","year":"2023","unstructured":"Cong, Y., Yang, M.Y., Rosenhahn, B.: Reltr: relation transformer for scene graph generation. IEEE TPAMI 45(9), 11169\u201311183 (2023)","journal-title":"IEEE TPAMI"},{"key":"67_CR4","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: IEEE CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"67_CR5","doi-asserted-by":"crossref","unstructured":"Ding, R., Yang, J., Xue, C., Zhang, W., Bai, S., Qi, X.: Pla: language-driven open-vocabulary 3d scene understanding. In: IEEE CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00677"},{"key":"67_CR6","doi-asserted-by":"crossref","unstructured":"Ege \u00d6zsoy, Czempiel, T., Evin P\u0131nar \u00d6rnek, Eck, U., Tombari, F., Navab, N.: Holistic or domain modeling: a semantic scene graph approach. IJCARS (2023)","DOI":"10.1007\/s11548-023-03022-w"},{"key":"67_CR7","doi-asserted-by":"crossref","unstructured":"Fan, H., Yang, Y., Kankanhalli, M.: Point 4d transformer networks for spatio-temporal modeling in point cloud videos. In: IEEE CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01398"},{"key":"67_CR8","unstructured":"Gao, K., Chen, L., Zhang, H., Xiao, J., Sun, Q.: Compositional prompt tuning with motion cues for open-vocabulary video relation detection. In: ICLR (2023)"},{"key":"67_CR9","doi-asserted-by":"crossref","unstructured":"Gao, X., Jin, Y., Long, Y., Dou, Q., Heng, P.A.: Trans-svnet: Accurate phase recognition from surgical videos via hybrid embedding aggregation transformer. In: MICCAI (2021)","DOI":"10.1007\/978-3-030-87202-1_57"},{"key":"67_CR10","doi-asserted-by":"publisher","first-page":"3728","DOI":"10.1002\/mp.13002","volume":"45","author":"OL Green","year":"2018","unstructured":"Green, O.L., Rankine, L.J., Cai, B., Curcuru, A., Kashani, R., Rodriguez, V., Li, H.H., Parikh, P.J., Robinson, C.G., Olsen, J.R., et\u00a0al.: First clinical implementation of real-time, real anatomy tracking and radiation beam control. Med. Phys. 45, 3728-3740 (2018)","journal-title":"Med. Phys."},{"key":"67_CR11","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask r-cnn. In: IEEE ICCV (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"67_CR12","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: IEEE CVPR (2015)","DOI":"10.1109\/CVPR.2016.90"},{"key":"67_CR13","doi-asserted-by":"crossref","unstructured":"He, T., Gao, L., Song, J., Li, Y.F.: Towards open-vocabulary scene graph generation with prompt-based finetuning. In: ECCV (2022)","DOI":"10.1007\/978-3-031-19815-1_4"},{"key":"67_CR14","unstructured":"Li, C., Wong, C., Zhang, S., Usuyama, N., Liu, H., Yang, J., Naumann, T., Poon, H., Gao, J.: Llava-med: training a large language-and-vision assistant for biomedicine in one day. In: NeurIPS (2023)"},{"key":"67_CR15","doi-asserted-by":"crossref","unstructured":"Liao, Y., Zhang, A., Lu, M., Wang, Y., Li, X., Liu, S.: Gen-vlkt: simplify association and enhance interaction understanding for hoi detection. In: IEEE CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01949"},{"key":"67_CR16","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Goyal, P., Girshick, R.B., He, K., Doll\u00e1r, P.: Focal loss for dense object detection. In: IEEE ICCV (2017)","DOI":"10.1109\/ICCV.2017.324"},{"key":"67_CR17","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: NeurIPS (2023)"},{"key":"67_CR18","doi-asserted-by":"crossref","unstructured":"Liu, J., Zhang, Y., Chen, J.N., Xiao, J., Lu, Y., A\u00a0Landman, B., Yuan, Y., Yuille, A., Tang, Y., Zhou, Z.: Clip-driven universal model for organ segmentation and tumor detection. In: IEEE ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01934"},{"key":"67_CR19","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., Krueger, G., Sutskever, I.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"67_CR20","doi-asserted-by":"crossref","unstructured":"Rezatofighi, H., Tsoi, N., Gwak, J., Sadeghian, A., Reid, I., Savarese, S.: Generalized intersection over union: a metric and a loss for bounding box regression. In: IEEE CVPR (2019)","DOI":"10.1109\/CVPR.2019.00075"},{"key":"67_CR21","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2023.102751","volume":"85","author":"L Sestini","year":"2023","unstructured":"Sestini, L., Rosa, B., De\u00a0Momi, E., Ferrigno, G., Padoy, N.: Fun-sis: A fully unsupervised approach for surgical instrument segmentation. Med. Image Anal. 85, 102751 (2023)","journal-title":"Med. Image Anal."},{"key":"67_CR22","doi-asserted-by":"publisher","first-page":"172","DOI":"10.1038\/s41586-023-06291-2","volume":"620","author":"K Singhal","year":"2023","unstructured":"Singhal, K., Azizi, S., Tu, T., Mahdavi, S.S., Wei, J., Chung, H.W., Scales, N., Tanwani, A., Cole-Lewis, H., Pfohl, S., et\u00a0al.: Large language models encode clinical knowledge. Nature 620, 172-180 (2023)","journal-title":"Nature"},{"key":"67_CR23","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, L., Polosukhin, I.: Attention is all you need. In: NeurIPS (2017)"},{"key":"67_CR24","doi-asserted-by":"crossref","unstructured":"Wald, J., Dhamo, H., Navab, N., Tombari, F.: Learning 3d semantic scene graphs from 3d indoor reconstructions. In: IEEE CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00402"},{"issue":"3","key":"67_CR25","doi-asserted-by":"publisher","first-page":"630","DOI":"10.1007\/s11263-021-01546-9","volume":"130","author":"J Wald","year":"2022","unstructured":"Wald, J., Navab, N., Tombari, F.: Learning 3d semantic scene graphs with instance embeddings. IJCV 130(3), 630\u2013651 (2022)","journal-title":"IJCV"},{"key":"67_CR26","doi-asserted-by":"crossref","unstructured":"Wang, Z., Cheng, B., Zhao, L., Xu, D., Tang, Y., Sheng, L.: Vl-sat: visual-linguistic semantics assisted training for 3d semantic scene graph prediction in point cloud. In: IEEE CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02065"},{"key":"67_CR27","doi-asserted-by":"crossref","unstructured":"\u00d6zsoy, E., Czempiel, T., Holm, F., Pellegrini, C., Navab, N.: Labrad-or: lightweight memory scene graphs for accurate bimodal reasoning in dynamic operating rooms. In: MICCAI (2023)","DOI":"10.1007\/978-3-031-43996-4_29"},{"key":"67_CR28","doi-asserted-by":"crossref","unstructured":"\u00d6zsoy, E., \u00d6rnek, E.P., Eck, U., Czempiel, T., Tombari, F., Navab, N.: 4d-or: semantic scene graphs for or domain modeling. In: MICCAI (2022)","DOI":"10.1007\/978-3-031-16449-1_45"}],"container-title":["Lecture Notes in Computer Science","Medical Image Computing and Computer Assisted Intervention \u2013 MICCAI 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72089-5_67","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,2]],"date-time":"2024-10-02T16:10:18Z","timestamp":1727885418000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72089-5_67"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031720888","9783031720895"],"references-count":28,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72089-5_67","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"3 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"MICCAI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Medical Image Computing and Computer-Assisted Intervention","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Marrakesh","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Morocco","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 October 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"miccai2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/conferences.miccai.org\/2024\/en\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}