{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T00:50:40Z","timestamp":1761094240159,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":19,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3728424.3760764","type":"proceedings-article","created":{"date-parts":[[2025,10,21]],"date-time":"2025-10-21T15:17:42Z","timestamp":1761059862000},"page":"52-56","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Vision-Language Models for Automatic Captioning and Cross-Modal Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-1111-6813","authenticated-orcid":false,"given":"Ikram","family":"Ounadi","sequence":"first","affiliation":[{"name":"University of Klagenfurt, Klagenfurt, Austria"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9218-1704","authenticated-orcid":false,"given":"Klaus","family":"Schoeffmann","sequence":"additional","affiliation":[{"name":"University of Klagenfurt, Klagenfurt, Austria"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,26]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Shruthi Bannur Stephanie Hyland Qianchu Liu Fernando P\u00e9rez-Garc\u00eda Maximilian Ilse Daniel C. Castro Benedikt Boecking Harshita Sharma Kenza Bouzid Anja Thieme Anton Schwaighofer Maria Wetscherek Matthew P. Lungren Aditya Nori Javier Alvarez-Valle and Ozan Oktay. 2023. Learning to Exploit Temporal Structure for Biomedical Vision-Language Processing. arXiv:2301.04558 [cs.CV] https:\/\/arxiv.org\/abs\/2301.04558","DOI":"10.1109\/CVPR52729.2023.01442"},{"key":"e_1_3_2_1_2_1","unstructured":"Kyunghyun Cho Bart van Merrienboer Caglar Gulcehre Dzmitry Bahdanau Fethi Bougares Holger Schwenk and Yoshua Bengio. 2014. Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation. arXiv:1406.1078 [cs.CL] https:\/\/arxiv.org\/abs\/1406.1078"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","unstructured":"Jun Fu Wei Zhou Qiuping Jiang Hantao Liu and Guangtao Zhai. 2024. Vision-Language Consistency Guided Multi-modal Prompt Learning for Blind AI Generated Image Quality Assessment. doi:10.48550\/arXiv.2406.16641","DOI":"10.48550\/arXiv.2406.16641"},{"key":"e_1_3_2_1_4_1","unstructured":"Negin Ghamsarian Yosuf El-Shabrawi Sahar Nasirihaghighi Doris Putzgruber-Adamitsch Martin Zinkernagel Sebastian Wolf Klaus Sch\u00f6ffmann and Raphael Sznitman. 2023. Cataract-1K: Cataract Surgery Dataset for Scene Segmentation Phase Recognition and Irregularity Detection. arXiv:2312.06295 [cs.CV] https:\/\/arxiv.org\/abs\/2312.06295"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00391"},{"key":"e_1_3_2_1_6_1","first-page":"102391","article-title":". Surgical Workflow Recognition with Transformer-Based Temporal Models","volume":"78","author":"Yuan Jin","year":"2022","unstructured":"Yuan Jin et al., 2022. Surgical Workflow Recognition with Transformer-Based Temporal Models. Medical Image Analysis, Vol. 78 (2022), 102391.","journal-title":"Medical Image Analysis"},{"key":"e_1_3_2_1_7_1","first-page":"1097","volume-title":"ImageNet Classification with Deep Convolutional Neural Networks. In Advances in Neural Information Processing Systems (NeurIPS)","volume":"25","author":"Krizhevsky Alex","year":"2012","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey E. Hinton. 2012. ImageNet Classification with Deep Convolutional Neural Networks. In Advances in Neural Information Processing Systems (NeurIPS), Vol. 25. Curran Associates, Inc., 1097-1105. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2012\/file\/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf"},{"key":"e_1_3_2_1_8_1","volume-title":"Chan Ho So, and Jaewoo Kang","author":"Lee Jinhyuk","year":"2019","unstructured":"Jinhyuk Lee, Wonjin Yoon, Sungdong Kim, Donghyeon Kim, Sunkyu Kim, Chan Ho So, and Jaewoo Kang. 2019. BioBERT: a pre-trained biomedical language representation model for biomedical text mining. CoRR, Vol. abs\/1901.08746 (2019). arXiv:1901.08746 http:\/\/arxiv.org\/abs\/1901.08746"},{"key":"e_1_3_2_1_9_1","volume-title":"BLIP: Bootstrapped Language-Image Pretraining for Unified Vision-Language Understanding and Generation. arXiv preprint arXiv:2201.12086","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. BLIP: Bootstrapped Language-Image Pretraining for Unified Vision-Language Understanding and Generation. arXiv preprint arXiv:2201.12086 (2022)."},{"key":"e_1_3_2_1_10_1","unstructured":"Suvaditya Mukherjee. 2022. The Annotated ResNet-50. https:\/\/towardsdatascience.com\/the-annotated-resnet-50-a6c536034758"},{"key":"e_1_3_2_1_11_1","volume-title":"An Introduction to Convolutional Neural Networks. arXiv preprint arXiv:1511.08458","author":"O'Shea Keiron","year":"2015","unstructured":"Keiron O'Shea and Ryan Nash. 2015. An Introduction to Convolutional Neural Networks. arXiv preprint arXiv:1511.08458 (2015). https:\/\/arxiv.org\/abs\/1511.08458"},{"key":"e_1_3_2_1_12_1","volume-title":"Chris Hallacy, and Gabriel Goh.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, and Gabriel Goh. 2021. Learning Transferable Visual Models From Natural Language Supervision. arXiv preprint arXiv:2103.00020 (2021)."},{"key":"e_1_3_2_1_13_1","volume-title":"ACM Multimedia Systems Conference.","author":"Schoeffmann Klaus","year":"2021","unstructured":"Klaus Schoeffmann, Matthias Taschwer, Stephan Sarny, Bernhard M\u00fcnzer, Michael J. Primus, and Daniel Putzgruber. 2021. Cataract-101 Dataset: Video-Based Analysis of Cataract Surgery. In ACM Multimedia Systems Conference."},{"key":"e_1_3_2_1_14_1","volume-title":"Advances in Neural Information Processing Systems","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information Processing Systems, Vol. 30. https:\/\/arxiv.org\/abs\/1706.03762"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_1_16_1","volume-title":"MedCLIP: Contrastive Learning from Unpaired Medical Images and Text. arXiv preprint arXiv:2210.10163","author":"Wang Zifeng","year":"2022","unstructured":"Zifeng Wang, Zhenbang Wu, Dinesh Agarwal, and Jimeng Sun. 2022. MedCLIP: Contrastive Learning from Unpaired Medical Images and Text. arXiv preprint arXiv:2210.10163 (2022). https:\/\/arxiv.org\/abs\/2210.10163"},{"key":"e_1_3_2_1_17_1","unstructured":"Li Yu Situo Wang Wei Zhou and Moncef Gabbouj. 2025. DVLTA-VQA: Decoupled Vision-Language Modeling with Text-Guided Adaptation for Blind Video Quality Assessment. arXiv:2504.11733 [cs.CV] https:\/\/arxiv.org\/abs\/2504.11733"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.5772\/217"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Odysseas Zisimopoulos Evangello Flouty Imanol Luengo Petros Giataganas Jean Nehme Andre Chow and Danail Stoyanov. 2018. DeepPhase: Surgical Phase Recognition in CATARACTS Videos. arXiv:1807.10565 [cs.CV] https:\/\/arxiv.org\/abs\/1807.10565","DOI":"10.1007\/978-3-030-00937-3_31"}],"event":{"name":"MM '25:The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2nd International Workshop on Multimedia Computing for Health and Medicine"],"original-title":[],"deposited":{"date-parts":[[2025,10,21]],"date-time":"2025-10-21T15:18:36Z","timestamp":1761059916000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3728424.3760764"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,26]]},"references-count":19,"alternative-id":["10.1145\/3728424.3760764","10.1145\/3728424"],"URL":"https:\/\/doi.org\/10.1145\/3728424.3760764","relation":{},"subject":[],"published":{"date-parts":[[2025,10,26]]},"assertion":[{"value":"2025-10-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}