{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T16:25:29Z","timestamp":1780763129243,"version":"3.54.1"},"publisher-location":"Cham","reference-count":31,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031720826","type":"print"},{"value":"9783031720833","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-72083-3_35","type":"book-chapter","created":{"date-parts":[[2024,10,13]],"date-time":"2024-10-13T18:01:42Z","timestamp":1728842502000},"page":"373-383","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":20,"title":["PathM3: A Multimodal Multi-task Multiple Instance Learning Framework for\u00a0Whole Slide Image Classification and\u00a0Captioning"],"prefix":"10.1007","author":[{"given":"Qifeng","family":"Zhou","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wenliang","family":"Zhong","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yuzhi","family":"Guo","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Michael","family":"Xiao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hehuan","family":"Ma","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Junzhou","family":"Huang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,10,14]]},"reference":[{"key":"35_CR1","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., Donahue, J., Luc, P., Miech, A., Barr, I., Hasson, Y., Lenc, K., Mensch, A., Millican, K., Reynolds, M., et\u00a0al.: Flamingo: a visual language model for few-shot learning. Advances in Neural Information Processing Systems 35, 23716\u201323736 (2022)","journal-title":"Advances in Neural Information Processing Systems"},{"key":"35_CR2","doi-asserted-by":"crossref","unstructured":"Chen, R.J., Lu, M.Y., Weng, W.H., Chen, T.Y., Williamson, D.F., Manz, T., Shady, M., Mahmood, F.: Multimodal co-attention transformer for survival prediction in gigapixel whole slide images. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 4015\u20134025 (2021)","DOI":"10.1109\/ICCV48922.2021.00398"},{"issue":"70","key":"35_CR3","first-page":"1","volume":"25","author":"HW Chung","year":"2024","unstructured":"Chung, H.W., Hou, L., Longpre, S., Zoph, B., Tay, Y., Fedus, W., Li, Y., Wang, X., Dehghani, M., Brahma, S., et\u00a0al.: Scaling instruction-finetuned language models. Journal of Machine Learning Research 25(70), 1\u201353 (2024)","journal-title":"Journal of Machine Learning Research"},{"key":"35_CR4","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers). pp. 4171\u20134186 (2019)"},{"key":"35_CR5","doi-asserted-by":"crossref","unstructured":"Ding, K., Zhou, M., Metaxas, D.N., Zhang, S.: Pathology-and-genomics multimodal transformer for survival outcome prediction. In: International Conference on Medical Image Computing and Computer-Assisted Intervention. pp. 622\u2013631. Springer (2023)","DOI":"10.1007\/978-3-031-43987-2_60"},{"key":"35_CR6","doi-asserted-by":"crossref","unstructured":"Gamper, J., Rajpoot, N.: Multiple instance captioning: Learning representations from histopathology textbooks and articles. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 16549\u201316559 (2021)","DOI":"10.1109\/CVPR46437.2021.01628"},{"issue":"9","key":"35_CR7","doi-asserted-by":"publisher","first-page":"2307","DOI":"10.1038\/s41591-023-02504-3","volume":"29","author":"Z Huang","year":"2023","unstructured":"Huang, Z., Bianchi, F., Yuksekgonul, M., Montine, T.J., Zou, J.: A visual\u2013language foundation model for pathology image analysis using medical twitter. Nature medicine 29(9), 2307\u20132316 (2023)","journal-title":"Nature medicine"},{"key":"35_CR8","unstructured":"Ilse, M., Tomczak, J., Welling, M.: Attention-based deep multiple instance learning. In: International conference on machine learning. pp. 2127\u20132136. PMLR (2018)"},{"key":"35_CR9","doi-asserted-by":"crossref","unstructured":"Li, B., Li, Y., Eliceiri, K.W.: Dual-stream multiple instance learning network for whole slide image classification with self-supervised contrastive learning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 14318\u201314328 (2021)","DOI":"10.1109\/CVPR46437.2021.01409"},{"key":"35_CR10","doi-asserted-by":"crossref","unstructured":"Li, C., Zhu, X., Yao, J., Huang, J.: Hierarchical transformer for survival prediction using multimodality whole slide images and genomics. In: 2022 26th international conference on pattern recognition (ICPR). pp. 4256\u20134262. IEEE (2022)","DOI":"10.1109\/ICPR56361.2022.9956296"},{"key":"35_CR11","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In: International conference on machine learning. pp. 19730\u201319742. PMLR (2023)"},{"key":"35_CR12","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. Advances in neural information processing systems 36 (2024)"},{"key":"35_CR13","doi-asserted-by":"crossref","unstructured":"Lu, M.Y., Chen, B., Zhang, A., Williamson, D.F., Chen, R.J., Ding, T., Le, L.P., Chuang, Y.S., Mahmood, F.: Visual language pretrained multiple instance zero-shot transfer for histopathology images. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 19764\u201319775 (2023)","DOI":"10.1109\/CVPR52729.2023.01893"},{"issue":"6","key":"35_CR14","doi-asserted-by":"publisher","first-page":"555","DOI":"10.1038\/s41551-020-00682-w","volume":"5","author":"MY Lu","year":"2021","unstructured":"Lu, M.Y., Williamson, D.F., Chen, T.Y., Chen, R.J., Barbieri, M., Mahmood, F.: Data-efficient and weakly supervised computational pathology on whole-slide images. Nature biomedical engineering 5(6), 555\u2013570 (2021)","journal-title":"Nature biomedical engineering"},{"key":"35_CR15","unstructured":"Qu, L., Fu, K., Wang, M., Song, Z., et\u00a0al.: The rise of ai language pathologists: Exploring two-level prompt learning for few-shot weakly-supervised whole slide image classification. Advances in Neural Information Processing Systems 36 (2024)"},{"key":"35_CR16","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International conference on machine learning. pp. 8748\u20138763. PMLR (2021)"},{"key":"35_CR17","first-page":"2136","volume":"34","author":"Z Shao","year":"2021","unstructured":"Shao, Z., Bian, H., Chen, Y., Wang, Y., Zhang, J., Ji, X., et\u00a0al.: Transmil: Transformer based correlated multiple instance learning for whole slide image classification. Advances in neural information processing systems 34, 2136\u20132147 (2021)","journal-title":"Advances in neural information processing systems"},{"key":"35_CR18","unstructured":"Tsuneki, M., Kanavati, F.: Inference of captions from histopathological patches. In: International Conference on Medical Imaging with Deep Learning. pp. 1235\u20131250. PMLR (2022)"},{"key":"35_CR19","doi-asserted-by":"crossref","unstructured":"Wang, P., Wells, W.M., Berkowitz, S., Horng, S., Golland, P.: Using multiple instance learning to build multimodal representations. In: International Conference on Information Processing in Medical Imaging. pp. 457\u2013470. Springer (2023)","DOI":"10.1007\/978-3-031-34048-2_35"},{"key":"35_CR20","doi-asserted-by":"crossref","unstructured":"Wang, X., Peng, Y., Lu, L., Lu, Z., Summers, R.M.: Tienet: Text-image embedding network for common thorax disease classification and reporting in chest x-rays. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 9049\u20139058 (2018)","DOI":"10.1109\/CVPR.2018.00943"},{"key":"35_CR21","unstructured":"Xiang, J., Zhang, J.: Exploring low-rank property in multiple instance learning for whole slide image classification. In: The Eleventh International Conference on Learning Representations (2022)"},{"issue":"3","key":"35_CR22","doi-asserted-by":"publisher","first-page":"135","DOI":"10.55524\/ijircst.2024.12.3.22","volume":"12","author":"L Xiao","year":"2024","unstructured":"Xiao, L., Xu, R., Cang, Y., Chen, Y., Wei, Y.: Advancing surgical imaging with cgan for effective defogging. International Journal of Innovative Research in Computer Science & Technology 12(3), 135\u2013139 (2024)","journal-title":"International Journal of Innovative Research in Computer Science & Technology"},{"key":"35_CR23","doi-asserted-by":"crossref","unstructured":"Xiong, Y., Zeng, Z., Chakraborty, R., Tan, M., Fung, G., Li, Y., Singh, V.: Nystr\u00f6mformer: A nystr\u00f6m-based algorithm for approximating self-attention. In: Proceedings of the AAAI Conference on Artificial Intelligence. vol.\u00a035, pp. 14138\u201314148 (2021)","DOI":"10.1609\/aaai.v35i16.17664"},{"key":"35_CR24","doi-asserted-by":"crossref","unstructured":"Xu, Y., Chen, H.: Multimodal optimal transport-based co-attention transformer with global structure consistency for survival prediction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 21241\u201321251 (2023)","DOI":"10.1109\/ICCV51070.2023.01942"},{"key":"35_CR25","doi-asserted-by":"crossref","unstructured":"Yan, Y., He, S., Yu, Z., Yuan, J., Liu, Z., Chen, Y.: Investigation of customized medical decision algorithms utilizing graph neural networks. arXiv preprint arXiv:2405.17460 (2024)","DOI":"10.1109\/ICSECE61636.2024.10729331"},{"key":"35_CR26","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2020.101789","volume":"65","author":"J Yao","year":"2020","unstructured":"Yao, J., Zhu, X., Jonnagaddala, J., Hawkins, N., Huang, J.: Whole slide images based cancer survival prediction using attention guided deep multiple instance learning networks. Medical Image Analysis 65, 101789 (2020)","journal-title":"Medical Image Analysis"},{"key":"35_CR27","doi-asserted-by":"crossref","unstructured":"Yao, J., Zhu, X., Zhu, F., Huang, J.: Deep correlational learning for survival prediction from multi-modality data. In: International Conference on Medical Image Computing and Computer-Assisted Intervention. pp. 406\u2013414. Springer (2017)","DOI":"10.1007\/978-3-319-66185-8_46"},{"key":"35_CR28","unstructured":"Zhang, Y., Gao, J., Tan, Z., Zhou, L., Ding, K., Zhou, M., Zhang, S., Wang, D.: Data-centric foundation models in computational healthcare: A survey. arXiv preprint arXiv:2401.02458 (2024)"},{"key":"35_CR29","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Gao, J., Zhou, M., Wang, X., Qiao, Y., Zhang, S., Wang, D.: Text-guided foundation model adaptation for pathological image classification. In: International Conference on Medical Image Computing and Computer-Assisted Intervention. pp. 272\u2013282. Springer (2023)","DOI":"10.1007\/978-3-031-43904-9_27"},{"key":"35_CR30","doi-asserted-by":"crossref","unstructured":"Zhou, F., Chen, H.: Cross-modal translation and alignment for survival analysis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 21485\u201321494 (2023)","DOI":"10.1109\/ICCV51070.2023.01964"},{"key":"35_CR31","doi-asserted-by":"crossref","unstructured":"Zhu, X., Yao, J., Zhu, F., Huang, J.: Wsisa: Making survival prediction from whole slide histopathological images. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 7234\u20137242 (2017)","DOI":"10.1109\/CVPR.2017.725"}],"container-title":["Lecture Notes in Computer Science","Medical Image Computing and Computer Assisted Intervention \u2013 MICCAI 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72083-3_35","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T15:37:19Z","timestamp":1732894639000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72083-3_35"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031720826","9783031720833"],"references-count":31,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72083-3_35","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"14 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"MICCAI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Medical Image Computing and Computer-Assisted Intervention","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Marrakesh","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Morocco","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 October 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"miccai2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/conferences.miccai.org\/2024\/en\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}