{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,16]],"date-time":"2026-04-16T10:08:42Z","timestamp":1776334122151,"version":"3.51.2"},"publisher-location":"Cham","reference-count":86,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031200588","type":"print"},{"value":"9783031200595","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-20059-5_1","type":"book-chapter","created":{"date-parts":[[2022,10,28]],"date-time":"2022-10-28T16:02:50Z","timestamp":1666972970000},"page":"1-21","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":208,"title":["Making the\u00a0Most of\u00a0Text Semantics to\u00a0Improve Biomedical Vision\u2013Language Processing"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4822-0531","authenticated-orcid":false,"given":"Benedikt","family":"Boecking","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0888-929X","authenticated-orcid":false,"given":"Naoto","family":"Usuyama","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5750-7628","authenticated-orcid":false,"given":"Shruthi","family":"Bannur","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6829-7045","authenticated-orcid":false,"given":"Daniel C.","family":"Castro","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1557-0527","authenticated-orcid":false,"given":"Anton","family":"Schwaighofer","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7571-6036","authenticated-orcid":false,"given":"Stephanie","family":"Hyland","sequence":"additional","affiliation":[]},{"given":"Maria","family":"Wetscherek","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2150-1747","authenticated-orcid":false,"given":"Tristan","family":"Naumann","sequence":"additional","affiliation":[]},{"given":"Aditya","family":"Nori","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0906-4177","authenticated-orcid":false,"given":"Javier","family":"Alvarez-Valle","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9067-0918","authenticated-orcid":false,"given":"Hoifung","family":"Poon","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2976-0874","authenticated-orcid":false,"given":"Ozan","family":"Oktay","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,10,29]]},"reference":[{"key":"1_CR1","doi-asserted-by":"publisher","unstructured":"Akbari, H., Karaman, S., Bhargava, S., Chen, B., Vondrick, C., Chang, S.F.: Multi-level multimodal common semantic space for image-phrase grounding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2019, Long Beach, CA, USA, 16\u201320 June 2019, pp. 12476\u201312486. Computer Vision Foundation\/IEEE (2019). https:\/\/doi.org\/10.1109\/CVPR.2019.01276","DOI":"10.1109\/CVPR.2019.01276"},{"key":"1_CR2","doi-asserted-by":"publisher","unstructured":"Alsentzer, E., et al.: Publicly available clinical BERT embeddings. In: Proceedings of the 2nd Clinical Natural Language Processing Workshop, pp. 72\u201378. Association for Computational Linguistics, Minneapolis, Minnesota (2019). https:\/\/doi.org\/10.18653\/v1\/W19-1909, https:\/\/aclanthology.org\/W19-1909","DOI":"10.18653\/v1\/W19-1909"},{"key":"1_CR3","doi-asserted-by":"publisher","first-page":"101797","DOI":"10.1016\/j.media.2020.101797","volume":"66","author":"A Bustos","year":"2020","unstructured":"Bustos, A., Pertusa, A., Salinas, J.M., de la Iglesia-Vay\u00e1, M.: PadChest: a large chest X-ray image dataset with multi-label annotated reports. Med. Image Anal. 66, 101797 (2020)","journal-title":"Med. Image Anal."},{"issue":"1","key":"1_CR4","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1186\/s12911-020-01362-0","volume":"21","author":"A Casey","year":"2021","unstructured":"Casey, A., et al.: A systematic review of natural language processing applied to radiology reports. BMC Med. Inf. Decis. Making 21(1), 1\u201318 (2021)","journal-title":"BMC Med. Inf. Decis. Making"},{"key":"1_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"529","DOI":"10.1007\/978-3-030-59713-9_51","volume-title":"Medical Image Computing and Computer Assisted Intervention \u2013 MICCAI 2020","author":"G Chauhan","year":"2020","unstructured":"Chauhan, G., et al.: Joint modeling of chest radiographs and radiology reports for pulmonary edema assessment. In: Martel, A.L., et al. (eds.) MICCAI 2020. LNCS, vol. 12262, pp. 529\u2013539. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-59713-9_51"},{"key":"1_CR6","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: III, H.D., Singh, A. (eds.) Proceedings of the 37th International Conference on Machine Learning, ICML 2020, 13\u201318 July 2020, Virtual Event. Proceedings of Machine Learning Research, vol. 119, pp. 1597\u20131607. PMLR (13\u201318 Jul 2020), http:\/\/proceedings.mlr.press\/v119\/chen20j.html"},{"key":"1_CR7","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1007\/978-3-030-58577-8_7","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y-C Chen","year":"2020","unstructured":"Chen, Y.-C.: UNITER: universal image-text representation learning. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 104\u2013120. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_7"},{"key":"1_CR8","doi-asserted-by":"publisher","unstructured":"Chen, Z., Song, Y., Chang, T.H., Wan, X.: Generating radiology reports via memory-driven transformer. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP). Association for Computational Linguistics (2020). https:\/\/doi.org\/10.18653\/v1\/2020.emnlp-main.112, https:\/\/aclanthology.org\/2020.emnlp-main.112","DOI":"10.18653\/v1\/2020.emnlp-main.112"},{"issue":"10162","key":"1_CR9","doi-asserted-by":"publisher","first-page":"2388","DOI":"10.1016\/S0140-6736(18)31645-3","volume":"392","author":"S Chilamkurthy","year":"2018","unstructured":"Chilamkurthy, S., et al.: Deep learning algorithms for detection of critical findings in head CT scans: a retrospective study. Lancet 392(10162), 2388\u20132396 (2018)","journal-title":"Lancet"},{"issue":"11","key":"1_CR10","doi-asserted-by":"publisher","first-page":"1451","DOI":"10.1109\/TMI.2006.880587","volume":"25","author":"WR Crum","year":"2006","unstructured":"Crum, W.R., Camara, O., Hill, D.L.: Generalized overlap measures for evaluation and validation in medical image analysis. IEEE Trans. Med. Imaging 25(11), 1451\u20131461 (2006)","journal-title":"IEEE Trans. Med. Imaging"},{"key":"1_CR11","doi-asserted-by":"publisher","unstructured":"Dai, S., Wang, Q., Lyu, Y., Zhu, Y.: BDKG at MEDIQA 2021: system report for the radiology report summarization task. In: Proceedings of the 20th Workshop on Biomedical Language Processing, pp. 103\u2013111. Association for Computational Linguistics (2021). https:\/\/doi.org\/10.18653\/v1\/2021.bionlp-1.11, https:\/\/aclanthology.org\/2021.bionlp-1.11","DOI":"10.18653\/v1\/2021.bionlp-1.11"},{"key":"1_CR12","doi-asserted-by":"publisher","unstructured":"Datta, S., Sikka, K., Roy, A., Ahuja, K., Parikh, D., Divakaran, A.: Align2Ground: weakly supervised phrase grounding guided by image-caption alignment. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, ICCV 2019, Seoul, Korea (South), 27 October\u20132 November 2019, pp. 2601\u20132610. IEEE (2019). https:\/\/doi.org\/10.1109\/ICCV.2019.00269","DOI":"10.1109\/ICCV.2019.00269"},{"key":"1_CR13","doi-asserted-by":"publisher","unstructured":"Datta, S., Roberts, K.: A hybrid deep learning approach for spatial trigger extraction from radiology reports. In: Proceedings of the Third International Workshop on Spatial Language Understanding, vol. 2020, pp. 50\u201355. Association for Computational Linguistics (2020). https:\/\/doi.org\/10.18653\/v1\/2020.splu-1.6, https:\/\/aclanthology.org\/2020.splu-1.6","DOI":"10.18653\/v1\/2020.splu-1.6"},{"key":"1_CR14","doi-asserted-by":"publisher","DOI":"10.1016\/j.jbi.2020.103473","volume":"108","author":"S Datta","year":"2020","unstructured":"Datta, S., Si, Y., Rodriguez, L., Shooshan, S.E., Demner-Fushman, D., Roberts, K.: Understanding spatial language in radiology: representation framework, annotation, and spatial relation extraction from chest X-ray reports using deep learning. J. Biomed. Inf. 108, 103473 (2020)","journal-title":"J. Biomed. Inf."},{"issue":"2","key":"1_CR15","doi-asserted-by":"publisher","first-page":"304","DOI":"10.1093\/jamia\/ocv080","volume":"23","author":"D Demner-Fushman","year":"2016","unstructured":"Demner-Fushman, D., et al.: Preparing a collection of radiology examinations for distribution and retrieval. J. Am. Med. Inf. Assoc. 23(2), 304\u2013310 (2016)","journal-title":"J. Am. Med. Inf. Assoc."},{"key":"1_CR16","doi-asserted-by":"crossref","unstructured":"Desai, K., Johnson, J.: VirTex: learning visual representations from textual annotations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11162\u201311173 (2021)","DOI":"10.1109\/CVPR46437.2021.01101"},{"key":"1_CR17","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 4171\u20134186. Association for Computational Linguistics, Minneapolis, Minnesota (2019). https:\/\/doi.org\/10.18653\/v1\/N19-1423, https:\/\/aclanthology.org\/N19-1423","DOI":"10.18653\/v1\/N19-1423"},{"issue":"3","key":"1_CR18","doi-asserted-by":"publisher","first-page":"448","DOI":"10.1136\/amiajnl-2013-001766","volume":"21","author":"D Dligach","year":"2014","unstructured":"Dligach, D., Bethard, S., Becker, L., Miller, T., Savova, G.K.: Discovering body site and severity modifiers in clinical texts. J. Am. Med. Inf. Assoc. 21(3), 448\u2013454 (2014)","journal-title":"J. Am. Med. Inf. Assoc."},{"issue":"2","key":"1_CR19","doi-asserted-by":"publisher","first-page":"100019","DOI":"10.1016\/j.patter.2020.100019","volume":"1","author":"JA Dunnmon","year":"2020","unstructured":"Dunnmon, J.A., et al.: Cross-modal data programming enables rapid medical machine learning. Patterns 1(2), 100019 (2020)","journal-title":"Patterns"},{"issue":"7639","key":"1_CR20","doi-asserted-by":"publisher","first-page":"115","DOI":"10.1038\/nature21056","volume":"542","author":"A Esteva","year":"2017","unstructured":"Esteva, A., et al.: Dermatologist-level classification of skin cancer with deep neural networks. Nature 542(7639), 115\u2013118 (2017)","journal-title":"Nature"},{"issue":"1","key":"1_CR21","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1038\/s41467-021-22018-1","volume":"12","author":"S Eyuboglu","year":"2021","unstructured":"Eyuboglu, S., et al.: Multi-task weak supervision enables anatomically-resolved abnormality detection in whole-body FDG-PET\/CT. Nature Commun. 12(1), 1\u201315 (2021)","journal-title":"Nature Commun."},{"key":"1_CR22","doi-asserted-by":"publisher","unstructured":"Fang, H., et al.: From captions to visual concepts and back. In: IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2015, Boston, MA, USA, 7\u201312 June 2015, pp. 1473\u20131482. IEEE Computer Society (2015). https:\/\/doi.org\/10.1109\/CVPR.2015.7298754","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"1_CR23","doi-asserted-by":"crossref","unstructured":"Gao, T., Yao, X., Chen, D.: SimCSE: simple contrastive learning of sentence embeddings. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 6894\u20136910 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.552"},{"issue":"23","key":"1_CR24","doi-asserted-by":"publisher","first-page":"e215","DOI":"10.1161\/01.CIR.101.23.e215","volume":"101","author":"AL Goldberger","year":"2000","unstructured":"Goldberger, A.L., et al.: PhysioBank, PhysioToolkit, and PhysioNet: components of a new research resource for complex physiologic signals. Circulation 101(23), e215\u2013e220 (2000)","journal-title":"Circulation"},{"key":"1_CR25","unstructured":"Graf, B., et al.: Pneumothorax and chest tube classification on chest X-rays for detection of missed pneumothorax. Machine Learning for Health (ML4H) NeurIPS Workshop: Extended Abstract (2020). https:\/\/arxiv.org\/abs\/2011.07353"},{"issue":"1","key":"1_CR26","first-page":"1","volume":"3","author":"Y Gu","year":"2021","unstructured":"Gu, Y., et al.: Domain-specific language model pretraining for biomedical natural language processing. ACM Trans. Comput. Healthc. (HEALTH) 3(1), 1\u201323 (2021)","journal-title":"ACM Trans. Comput. Healthc. (HEALTH)"},{"key":"1_CR27","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"752","DOI":"10.1007\/978-3-030-58580-8_44","volume-title":"Computer Vision \u2013 ECCV 2020","author":"T Gupta","year":"2020","unstructured":"Gupta, T., Vahdat, A., Chechik, G., Yang, X., Kautz, J., Hoiem, D.: Contrastive learning for weakly supervised phrase grounding. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12348, pp. 752\u2013768. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58580-8_44"},{"key":"1_CR28","unstructured":"Hayat, N., Lashen, H., Shamout, F.E.: Multi-label generalized zero shot learning for the classiffcation of disease in chest radiographs. In: Machine Learning for Healthcare Conference, pp. 461\u2013477. PMLR (2021)"},{"key":"1_CR29","doi-asserted-by":"publisher","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778. IEEE Computer Society (2016). https:\/\/doi.org\/10.1109\/CVPR.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"key":"1_CR30","unstructured":"Hsu, T.M.H., Weng, W.H., Boag, W., McDermott, M., Szolovits, P.: Unsupervised multimodal representation learning across medical images and reports. Machine Learning for Health (ML4H) NeurIPS Workshop (2018). https:\/\/arxiv.org\/abs\/1811.08615"},{"key":"1_CR31","doi-asserted-by":"crossref","unstructured":"Huang, S.C., Shen, L., Lungren, M.P., Yeung, S.: GLoRIA: a multimodal global-local representation learning framework for label-efficient medical image recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3942\u20133951 (2021)","DOI":"10.1109\/ICCV48922.2021.00391"},{"key":"1_CR32","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: Accelerating deep network training by reducing internal covariate shift. In: International Conference on Machine Learning, pp. 448\u2013456. PMLR (2015)"},{"key":"1_CR33","doi-asserted-by":"publisher","unstructured":"Irvin, J., et al.: CheXpert: a large chest radiograph dataset with uncertainty labels and expert comparison. In: Thirty-Third AAAI Conference on Artificial Intelligence, pp. 590\u2013597. AAAI Press (2019). https:\/\/doi.org\/10.1609\/aaai.v33i01.3301590","DOI":"10.1609\/aaai.v33i01.3301590"},{"key":"1_CR34","unstructured":"Johnson, A., Pollard, T., Berkowitz, S., Mark, R., Horng, S.: MIMIC-CXR database (version 2.0.0). PhysioNet (2019)"},{"issue":"1","key":"1_CR35","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1038\/sdata.2016.35","volume":"3","author":"AE Johnson","year":"2016","unstructured":"Johnson, A.E., et al.: MIMIC-III, a freely accessible critical care database. Sci. Data 3(1), 1\u20139 (2016)","journal-title":"Sci. Data"},{"key":"1_CR36","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1007\/978-3-319-46478-7_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"A Joulin","year":"2016","unstructured":"Joulin, A., van der Maaten, L., Jabri, A., Vasilache, N.: Learning visual features from large weakly supervised data. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9911, pp. 67\u201384. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46478-7_5"},{"key":"1_CR37","doi-asserted-by":"publisher","unstructured":"Lanfredi, R.B., et al.: REFLACX, a dataset of reports and eye-tracking data for localization of abnormalities in chest x-rays. arXiv preprint arXiv:2109.14187 (2021). https:\/\/doi.org\/10.13026\/e0dj-8498","DOI":"10.13026\/e0dj-8498"},{"issue":"4","key":"1_CR38","doi-asserted-by":"publisher","first-page":"541","DOI":"10.1162\/neco.1989.1.4.541","volume":"1","author":"Y LeCun","year":"1989","unstructured":"LeCun, Y., et al.: Backpropagation applied to handwritten zip code recognition. Neural Comput. 1(4), 541\u2013551 (1989)","journal-title":"Neural Comput."},{"issue":"4","key":"1_CR39","first-page":"1234","volume":"36","author":"J Lee","year":"2020","unstructured":"Lee, J., et al.: BioBERT: a pre-trained biomedical language representation model for biomedical text mining. Bioinf. 36(4), 1234\u20131240 (2020)","journal-title":"Bioinf."},{"key":"1_CR40","doi-asserted-by":"publisher","unstructured":"Li, A., Jabri, A., Joulin, A., Van Der Maaten, L.: Learning visual n-grams from web data. In: IEEE International Conference on Computer Vision, ICCV 2017, Venice, Italy, 22\u201329 October 2017, pp. 4183\u20134192. IEEE Computer Society (2017). https:\/\/doi.org\/10.1109\/ICCV.2017.449, http:\/\/doi.ieeecomputersociety.org\/10.1109\/ICCV.2017.449","DOI":"10.1109\/ICCV.2017.449"},{"key":"1_CR41","doi-asserted-by":"crossref","unstructured":"Li, G., Duan, N., Fang, Y., Gong, M., Jiang, D.: Unicoder-VL: a universal encoder for vision and language by cross-modal pre-training. In: The Thirty-Fourth AAAI Conference on Artificial Intelligence, AAAI 2020, The Thirty-Second Innovative Applications of Artificial Intelligence Conference, IAAI 2020, The Tenth AAAI Symposium on Educational Advances in Artificial Intelligence, EAAI 2020, New York, NY, USA, 7\u201312 February 2020, vol. 34, no. 7, pp. 11336\u201311344. AAAI Press (2020). https:\/\/aaai.org\/ojs\/index.php\/AAAI\/article\/view\/6795","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"1_CR42","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.J., Chang, K.W.: VisualBERT: a simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)"},{"key":"1_CR43","unstructured":"Li, Y., et al.: Supervision exists everywhere: a data efficient contrastive language-image pre-training paradigm. arXiv preprint arXiv:2110.05208 (2021)"},{"key":"1_CR44","doi-asserted-by":"crossref","unstructured":"Li, Y., Wang, H., Luo, Y.: A comparison of pre-trained vision-and-language models for multimodal representation learning across medical images and reports. In: 2020 IEEE International Conference on Bioinformatics and Biomedicine (BIBM), pp. 1999\u20132004. IEEE (2020)","DOI":"10.1109\/BIBM49941.2020.9313289"},{"key":"1_CR45","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"273","DOI":"10.1007\/978-3-030-87196-3_26","volume-title":"Medical Image Computing and Computer Assisted Intervention \u2013 MICCAI 2021","author":"R Liao","year":"2021","unstructured":"Liao, R., et al.: Multimodal representation learning via maximization of local mutual information. In: de Bruijne, M., et al. (eds.) MICCAI 2021. LNCS, vol. 12902, pp. 273\u2013283. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-87196-3_26"},{"key":"1_CR46","unstructured":"Liu, G., et al.: Clinically accurate chest X-ray report generation. In: Machine Learning for Healthcare Conference, pp. 249\u2013269. PMLR (2019)"},{"key":"1_CR47","unstructured":"Liu, X., et al.: Adversarial training for large neural language models. arXiv preprint arXiv:2004.08994 (2020)"},{"key":"1_CR48","unstructured":"Liu, Y., et al.: RoBERTa: a robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)"},{"key":"1_CR49","doi-asserted-by":"crossref","unstructured":"Liu, Y., Wan, B., Ma, L., He, X.: Relation-aware instance refinement for weakly supervised visual grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5612\u20135621 (2021)","DOI":"10.1109\/CVPR46437.2021.00556"},{"key":"1_CR50","unstructured":"Logeswaran, L., Lee, H.: An efficient framework for learning sentence representations. In: 6th International Conference on Learning Representations, ICLR 2018, Vancouver, BC, Canada, 30 April - 3 May 2018, Conference Track Proceedings. OpenReview.net (2018). https:\/\/openreview.net\/forum?id=rJvJXZb0W"},{"key":"1_CR51","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: International Conference on Learning Representations (2018). https:\/\/openreview.net\/forum?id=Bkg6RiCqY7"},{"key":"1_CR52","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Advances in Neural Information Processing Systems, vol. 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019 (December), pp. 8\u201314 (2019). Vancouver, BC, Canada, pp. 13\u201323 (2019). https:\/\/proceedings.neurips.cc\/paper\/2019\/hash\/c74d97b01eae257e44aa9d5bade97baf-Abstract.html"},{"key":"1_CR53","doi-asserted-by":"publisher","unstructured":"Mao, J., Huang, J., Toshev, A., Camburu, O., Yuille, A.L., Murphy, K.: Generation and comprehension of unambiguous object descriptions. In: Proceedings of the 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016, Las Vegas, NV, USA, 27\u201330 June 2016, pp. 11\u201320. IEEE Computer Society (2016). https:\/\/doi.org\/10.1109\/CVPR.2016.9","DOI":"10.1109\/CVPR.2016.9"},{"key":"1_CR54","doi-asserted-by":"publisher","unstructured":"Miura, Y., Zhang, Y., Tsai, E., Langlotz, C., Jurafsky, D.: Improving factual completeness and consistency of image-to-text radiology report generation. In: Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 5288\u20135304. Association for Computational Linguistics (2021). https:\/\/doi.org\/10.18653\/v1\/2021.naacl-main.416, https:\/\/aclanthology.org\/2021.naacl-main.416","DOI":"10.18653\/v1\/2021.naacl-main.416"},{"key":"1_CR55","doi-asserted-by":"crossref","unstructured":"Mu, Z., Tang, S., Tan, J., Yu, Q., Zhuang, Y.: Disentangled motif-aware graph learning for phrase grounding. In: AAAI (2021)","DOI":"10.1609\/aaai.v35i15.17602"},{"key":"1_CR56","doi-asserted-by":"crossref","unstructured":"M\u00fcller, P., Kaissis, G., Zou, C., R\u00fcckert, D.: Joint learning of localized representations from medical images and reports. arXiv preprint arXiv:2112.02889 (2021)","DOI":"10.1007\/978-3-031-19809-0_39"},{"key":"1_CR57","doi-asserted-by":"publisher","unstructured":"Nguyen, H.Q., et al.: VinDr-CXR: an open dataset of chest X-rays with radiologist\u2019s annotations. arXiv preprint arXiv:2012.15029 (2020). https:\/\/doi.org\/10.13026\/3akn-b287","DOI":"10.13026\/3akn-b287"},{"key":"1_CR58","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"1_CR59","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE international conference on computer vision, pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"1_CR60","unstructured":"Preechakul, K., Piansaddhayanon, C., Naowarat, B., Khandhawit, T., Sriswasdi, S., Chuangsuwanich, E.: Set prediction in the latent space. In: Advances in Neural Information Processing Systems, vol. 34 (2021)"},{"key":"1_CR61","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"1_CR62","doi-asserted-by":"crossref","unstructured":"Rao, Y., et al.: DenseCLIP: language-guided dense prediction with context-aware prompting. arXiv preprint arXiv:2112.01518 (2021)","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"1_CR63","unstructured":"Redmon, J., Farhadi, A.: YOLOv3: an incremental improvement. arXiv preprint arXiv:1804.02767 (2018)"},{"key":"1_CR64","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: Towards real-time object detection with region proposal networks. In: Advances in Neural Information Processing Systems, vol. 28: Annual Conference on Neural Information Processing Systems 2015(December), pp. 7\u201312, 2015. Montreal, Quebec, Canada 28, pp. 91\u201399 (2015). https:\/\/proceedings.neurips.cc\/paper\/2015\/hash\/14bfa6bb14875e45bba028a21ed38046-Abstract.html"},{"key":"1_CR65","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"1_CR66","doi-asserted-by":"crossref","unstructured":"Shih, G., et al.: Augmenting the national institutes of health chest radiograph dataset with expert annotations of possible pneumonia. Radiol.: Artif. Intell. 1(1), e180041 (2019)","DOI":"10.1148\/ryai.2019180041"},{"key":"1_CR67","unstructured":"Shivade, C.: MedNLI - A natural language inference dataset for the clinical domain. PhysioNet (2019)"},{"key":"1_CR68","unstructured":"Simard, P., Steinkraus, D., Platt, J.: Best practices for convolutional neural networks applied to visual document analysis. In: Seventh International Conference on Document Analysis and Recognition, 2003. Proceedings, pp. 958\u2013963. IEEE (2003)"},{"key":"1_CR69","doi-asserted-by":"publisher","unstructured":"Smit, A., Jain, S., Rajpurkar, P., Pareek, A., Ng, A.Y., Lungren, M.: Combining automatic labelers and expert annotations for accurate radiology report labeling using BERT. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1500\u20131519. Association for Computational Linguistics (2020). https:\/\/doi.org\/10.18653\/v1\/2020.emnlp-main.117, https:\/\/aclanthology.org\/2020.emnlp-main.117","DOI":"10.18653\/v1\/2020.emnlp-main.117"},{"key":"1_CR70","unstructured":"Su, W., et al.: VL-BERT: pre-training of generic visual-linguistic representations. In: 8th International Conference on Learning Representations, ICLR 2020, Addis Ababa, Ethiopia, 26\u201330 April 2020. OpenReview.net (2019). https:\/\/openreview.net\/forum?id=SygXPaEYvH"},{"key":"1_CR71","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"45","DOI":"10.1007\/978-3-030-59719-1_5","volume-title":"Medical Image Computing and Computer Assisted Intervention \u2013 MICCAI 2020","author":"LK Tam","year":"2020","unstructured":"Tam, L.K., Wang, X., Turkbey, E., Lu, K., Wen, Y., Xu, D.: Weakly supervised one-stage vision and language disease detection using large scale pneumonia and pneumothorax studies. In: Martel, A.L., et al. (eds.) MICCAI 2020. LNCS, vol. 12264, pp. 45\u201355. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-59719-1_5"},{"issue":"9","key":"1_CR72","doi-asserted-by":"publisher","first-page":"1337","DOI":"10.1038\/s41591-018-0147-y","volume":"24","author":"JJ Titano","year":"2018","unstructured":"Titano, J.J., et al.: Automated deep-neural-network surveillance of cranial images for acute neurologic events. Nat. Med. 24(9), 1337\u20131341 (2018)","journal-title":"Nat. Med."},{"key":"1_CR73","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30, pp. 5998\u20136008 (2017). https:\/\/proceedings.neurips.cc\/paper\/2017\/hash\/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html"},{"issue":"11","key":"1_CR74","doi-asserted-by":"publisher","first-page":"1015","DOI":"10.1016\/j.crad.2011.05.013","volume":"66","author":"A Wallis","year":"2011","unstructured":"Wallis, A., McCoubrie, P.: The radiology report-are we getting the message across? Clin. Radiol. 66(11), 1015\u20131022 (2011)","journal-title":"Clin. Radiol."},{"key":"1_CR75","doi-asserted-by":"publisher","unstructured":"Wang, Q., Tan, H., Shen, S., Mahoney, M., Yao, Z.: MAF: multimodal alignment framework for weakly-supervised phrase grounding. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 2030\u20132038. Association for Computational Linguistics, Online (2020). https:\/\/doi.org\/10.18653\/v1\/2020.emnlp-main.159, https:\/\/aclanthology.org\/2020.emnlp-main.159","DOI":"10.18653\/v1\/2020.emnlp-main.159"},{"key":"1_CR76","doi-asserted-by":"publisher","unstructured":"Wang, X., Peng, Y., Lu, L., Lu, Z., Bagheri, M., Summers, R.M.: ChestX-Ray8: hospital-scale chest X-ray database and benchmarks on weakly-supervised classification and localization of common thorax diseases. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017, Honolulu, HI, USA, 21\u201326 July 2017, pp. 2097\u20132106. IEEE Computer Society (2017). https:\/\/doi.org\/10.1109\/CVPR.2017.369","DOI":"10.1109\/CVPR.2017.369"},{"issue":"7","key":"1_CR77","doi-asserted-by":"publisher","first-page":"33","DOI":"10.37549\/AR1440","volume":"35","author":"JR Wilcox","year":"2006","unstructured":"Wilcox, J.R.: The written radiology report. Appl. Radiol. 35(7), 33 (2006)","journal-title":"Appl. Radiol."},{"key":"1_CR78","unstructured":"Wolf, T., et al.: Huggingface\u2019s transformers: state-of-the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019)"},{"key":"1_CR79","unstructured":"Wu, J.T., et al.: Chest ImaGenome dataset for clinical reasoning. In: Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2) (2021)"},{"key":"1_CR80","unstructured":"Wu, Y., et al.: Google\u2019s neural machine translation system: bridging the gap between human and machine translation. arXiv preprint arXiv:1609.08144 (2016)"},{"key":"1_CR81","unstructured":"You, Y., Gitman, I., Ginsburg, B.: Large batch training of convolutional networks. arXiv preprint arXiv:1708.03888 (2017)"},{"key":"1_CR82","unstructured":"Yu, F., Koltun, V.: Multi-scale context aggregation by dilated convolutions. In: Bengio, Y., LeCun, Y. (eds.) 4th International Conference on Learning Representations, ICLR 2016, San Juan, Puerto Rico, 2\u20134 May 2016, Conference Track Proceedings (2016). http:\/\/arxiv.org\/abs\/1511.07122"},{"key":"1_CR83","doi-asserted-by":"publisher","unstructured":"Yu, T., et al.: Cross-modal omni interaction modeling for phrase grounding. In: MM 2020: The 28th ACM International Conference on Multimedia, Virtual Event\/Seattle, WA, USA, 12\u201316 October 2020, pp. 1725\u20131734 (2020). https:\/\/doi.org\/10.1145\/3394171.3413846","DOI":"10.1145\/3394171.3413846"},{"key":"1_CR84","doi-asserted-by":"publisher","unstructured":"Zhang, Y., Ding, D.Y., Qian, T., Manning, C.D., Langlotz, C.P.: Learning to summarize radiology findings. In: Proceedings of the Ninth International Workshop on Health Text Mining and Information Analysis, pp. 204\u2013213. Association for Computational Linguistics (2018). https:\/\/doi.org\/10.18653\/v1\/W18-5623, https:\/\/aclanthology.org\/W18-5623","DOI":"10.18653\/v1\/W18-5623"},{"key":"1_CR85","unstructured":"Zhang, Y., Jiang, H., Miura, Y., Manning, C.D., Langlotz, C.P.: Contrastive learning of medical visual representations from paired images and text. arXiv preprint arXiv:2010.00747 (2020)"},{"key":"1_CR86","first-page":"18123","volume":"33","author":"Z Zhang","year":"2020","unstructured":"Zhang, Z., Zhao, Z., Lin, Z., He, X., et al.: Counterfactual contrastive learning for weakly-supervised vision-language grounding. Adv. Neural. Inf. Process. Syst. 33, 18123\u201318134 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-20059-5_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,3,10]],"date-time":"2023-03-10T13:10:58Z","timestamp":1678453858000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-20059-5_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031200588","9783031200595"],"references-count":86,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-20059-5_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"29 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}