{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T04:11:27Z","timestamp":1772770287604,"version":"3.50.1"},"publisher-location":"Cham","reference-count":28,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032049773","type":"print"},{"value":"9783032049780","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,9,19]],"date-time":"2025-09-19T00:00:00Z","timestamp":1758240000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,19]],"date-time":"2025-09-19T00:00:00Z","timestamp":1758240000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-04978-0_8","type":"book-chapter","created":{"date-parts":[[2025,9,18]],"date-time":"2025-09-18T16:17:18Z","timestamp":1758212238000},"page":"78-88","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["CoCa-CXR: Contrastive Captioners Learn Strong Temporal Structures for\u00a0Chest X-Ray Vision-Language Understanding"],"prefix":"10.1007","author":[{"given":"Yixiong","family":"Chen","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shawn","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Andrew","family":"Sellergren","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yossi","family":"Matias","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Avinatan","family":"Hassidim","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shravya","family":"Shetty","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Daniel","family":"Golden","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alan L.","family":"Yuille","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lin","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,9,19]]},"reference":[{"key":"8_CR1","unstructured":"Bannur, S., et\u00a0al.: MAIRA-2: grounded radiology report generation. arXiv preprint arXiv:2406.04449 (2024)"},{"key":"8_CR2","doi-asserted-by":"crossref","unstructured":"Bannur, S., et\u00a0al.: Learning to exploit temporal structure for biomedical vision-language processing. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15016\u201315027 (2023)","DOI":"10.1109\/CVPR52729.2023.01442"},{"key":"8_CR3","unstructured":"Bannur, S., et al.: MS-CXR-T: learning to exploit temporal structure for biomedical vision-language processing (version 1.0.0). PhysioNet (2023). https:\/\/doi.org\/10.13026\/pg10-j984"},{"key":"8_CR4","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/978-3-031-20059-5_1","volume-title":"ECCV 2022","author":"B Boecking","year":"2022","unstructured":"Boecking, B., et al.: Making the most of text semantics to improve biomedical vision-language processing. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13696, pp. 1\u201321. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20059-5_1"},{"key":"8_CR5","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"706","DOI":"10.1007\/978-3-031-43907-0_67","volume-title":"MICCAI 2023","author":"Y Chen","year":"2023","unstructured":"Chen, Y., Liu, L., Li, J., Jiang, H., Ding, C., Zhou, Z.: MetaLR: meta-tuning of learning rates for transfer learning in medical imaging. In: Greenspan, H., et al. (eds.) MICCAI 2023. LNCS, vol. 14220, pp. 706\u2013716. Springer, Cham (2023). https:\/\/doi.org\/10.1007\/978-3-031-43907-0_67"},{"key":"8_CR6","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A.G., Kirillov, A., Girdhar, R.: Masked-attention mask transformer for universal image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1290\u20131299 (2022)","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"8_CR7","unstructured":"Cho, Y., Kim, T., Shin, H., Cho, S., Shin, D.: Pretraining vision-language model for difference visual question answering in longitudinal chest X-rays. In: Medical Imaging with Deep Learning (2024)"},{"key":"8_CR8","unstructured":"Dosovitskiy, A.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"8_CR9","unstructured":"Endo, M., Krishnan, R., Krishna, V., Ng, A.Y., Rajpurkar, P.: Retrieval-based chest X-ray report generation using a pre-trained contrastive language-image model. In: Machine Learning for Health, pp. 209\u2013219. PMLR (2021)"},{"key":"8_CR10","doi-asserted-by":"crossref","unstructured":"Goldberger, A.L., et al.: PhysioBank, PhysioToolkit, and PhysioNet: components of a new research resource for complex physiologic signals. Circulation 101(23), e215\u2013e220 (2000)","DOI":"10.1161\/01.CIR.101.23.e215"},{"key":"8_CR11","doi-asserted-by":"crossref","unstructured":"Huang, S.C., Shen, L., Lungren, M.P., Yeung, S.: Gloria: a multimodal global-local representation learning framework for label-efficient medical image recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3942\u20133951 (2021)","DOI":"10.1109\/ICCV48922.2021.00391"},{"key":"8_CR12","unstructured":"Johnson, A., Pollard, T., Mark, R., Berkowitz, S., Horng, S.: Mimic-CXR database (version 2.0.0). PhysioNet (2019). https:\/\/doi.org\/10.13026\/C2JT1Q"},{"key":"8_CR13","doi-asserted-by":"crossref","unstructured":"Johnson, A.E., et al.: Mimic-CXR, a de-identified publicly available database of chest radiographs with free-text reports. Sci. Data 6(1), 317 (2019)","DOI":"10.1038\/s41597-019-0322-0"},{"key":"8_CR14","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"581","DOI":"10.1007\/978-3-031-16431-6_55","volume-title":"MICCAI 2022","author":"G Karwande","year":"2022","unstructured":"Karwande, G., Mbakwe, A.B., Wu, J.T., Celi, L.A., Moradi, M., Lourentzou, I.: CheXRelNet: an anatomy-aware model for tracking longitudinal relationships between chest X-rays. In: Wang, L., Dou, Q., Fletcher, P.T., Speidel, S., Li, S. (eds.) MICCAI 2022. LNCS, vol. 13431, pp. 581\u2013591. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-16431-6_55"},{"key":"8_CR15","doi-asserted-by":"crossref","unstructured":"Li, C., Wong, C., et al.: LLaVA-Med: training a large language-and-vision assistant for biomedicine in one day. Adv. Neural Inf. Process. Syst. 36 (2024)","DOI":"10.32388\/VLXB6M"},{"key":"8_CR16","doi-asserted-by":"crossref","unstructured":"Miura, Y., Zhang, Y., Tsai, E., Langlotz, C., Jurafsky, D.: Improving factual completeness and consistency of image-to-text radiology report generation. In: Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 5288\u20135304 (2021)","DOI":"10.18653\/v1\/2021.naacl-main.416"},{"key":"8_CR17","doi-asserted-by":"publisher","DOI":"10.1016\/j.artmed.2023.102633","volume":"144","author":"A Nicolson","year":"2023","unstructured":"Nicolson, A., Dowling, J., Koopman, B.: Improving chest X-ray report generation by leveraging warm starting. Artif. Intell. Med. 144, 102633 (2023)","journal-title":"Artif. Intell. Med."},{"key":"8_CR18","unstructured":"Steiner, A., et\u00a0al.: PaliGemma 2: a family of versatile VLMs for transfer. arXiv preprint arXiv:2412.03555 (2024)"},{"key":"8_CR19","unstructured":"Tanno, R., et\u00a0al.: Consensus, dissensus and synergy between clinicians and specialist foundation models in radiology report generation. arXiv preprint arXiv:2311.18260 (2023)"},{"key":"8_CR20","unstructured":"Team, G., et\u00a0al.: Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)"},{"key":"8_CR21","doi-asserted-by":"crossref","unstructured":"Tu, T., et\u00a0al.: Towards generalist biomedical AI. NEJM AI 1(3), AIoa2300138 (2024)","DOI":"10.1056\/AIoa2300138"},{"key":"8_CR22","unstructured":"Vaswani, A.: Attention is all you need. Adv. Neural Inf. Process. Syst. (2017)"},{"key":"8_CR23","doi-asserted-by":"publisher","first-page":"183","DOI":"10.1007\/978-3-031-73001-6_11","volume-title":"ECCV 2024LNCS","author":"F Wang","year":"2024","unstructured":"Wang, F., Du, S., Yu, L.: HERGen: elevating radiology report generation with longitudinal data. In: Leonardis, A., Ricci, E., Roth, S., Russakovsky, O., Sattler, T., Varol, G. (eds.) ECCV 2024LNCS, vol. 15113, pp. 183\u2013200. Springer, Cham (2024). https:\/\/doi.org\/10.1007\/978-3-031-73001-6_11"},{"key":"8_CR24","unstructured":"Wu, J., et al.: Chest imagenome dataset (version 1.0.0). PhysioNet (2021). https:\/\/doi.org\/10.13026\/wv01-y230"},{"key":"8_CR25","unstructured":"Wu, J.T., et\u00a0al.: Chest imagenome dataset for clinical reasoning. arXiv preprint arXiv:2108.00316 (2021)"},{"key":"8_CR26","unstructured":"Yang, J., Su, B., Zhao, W.X., Wen, J.R.: Unlocking the power of spatial and temporal information in medical multimodal pre-training. arXiv preprint arXiv:2405.19654 (2024)"},{"key":"8_CR27","unstructured":"Yang, L., et\u00a0al.: Advancing multimodal medical capabilities of Gemini. arXiv preprint arXiv:2405.03162 (2024)"},{"key":"8_CR28","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: CoCa: contrastive captioners are image-text foundation models. Trans. Mach. Learn. Res. (2022)"}],"container-title":["Lecture Notes in Computer Science","Medical Image Computing and Computer Assisted Intervention \u2013 MICCAI 2025"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-04978-0_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,18]],"date-time":"2025-09-18T22:06:14Z","timestamp":1758233174000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-04978-0_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,19]]},"ISBN":["9783032049773","9783032049780"],"references-count":28,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-04978-0_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9,19]]},"assertion":[{"value":"19 September 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors\u00a0have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"MICCAI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Medical Image Computing and Computer-Assisted Intervention","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Daejeon","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Korea (Republic of)","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"miccai2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/conferences.miccai.org\/2025\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}