{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,5]],"date-time":"2026-05-05T23:06:47Z","timestamp":1778022407755,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":16,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,9,22]],"date-time":"2023-09-22T00:00:00Z","timestamp":1695340800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,9,22]]},"DOI":"10.1145\/3627377.3627442","type":"proceedings-article","created":{"date-parts":[[2023,12,4]],"date-time":"2023-12-04T12:08:25Z","timestamp":1701691705000},"page":"414-418","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":22,"title":["Enhancing Multimodal Understanding with CLIP-Based Image-to-Text Transformation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-0124-1452","authenticated-orcid":false,"given":"Chang","family":"Che","sequence":"first","affiliation":[{"name":"The George Washington University, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7716-7761","authenticated-orcid":false,"given":"Qunwei","family":"Lin","sequence":"additional","affiliation":[{"name":"Trine University, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0086-7554","authenticated-orcid":false,"given":"Xinyu","family":"Zhao","sequence":"additional","affiliation":[{"name":"Trine University, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0291-6092","authenticated-orcid":false,"given":"Jiaxin","family":"Huang","sequence":"additional","affiliation":[{"name":"Trine University, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4243-1511","authenticated-orcid":false,"given":"Liqiang","family":"Yu","sequence":"additional","affiliation":[{"name":"The University of Chicago, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,12,4]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diabetic Retinopathy Detection with Enhanced Vision Transformers: The Twins-PCPVT Solution. In 2023 IEEE 3rd International Conference on Electronic Technology, Communication and Information (ICETCI)","author":"Dai Weinan","unstructured":"Weinan Dai, Chengjie Mou, Jun Wu, and Xuesong Ye. 2023. Diabetic Retinopathy Detection with Enhanced Vision Transformers: The Twins-PCPVT Solution. In 2023 IEEE 3rd International Conference on Electronic Technology, Communication and Information (ICETCI). IEEE, 403\u2013407."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.7763\/IJCTE.2015.V7.921"},{"key":"e_1_3_2_1_3_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_4_1","volume-title":"Multi30k: Multilingual english-german image descriptions. arXiv preprint arXiv:1605.00459","author":"Elliott Desmond","year":"2016","unstructured":"Desmond Elliott, Stella Frank, Khalil Sima\u2019an, and Lucia Specia. 2016. Multi30k: Multilingual english-german image descriptions. arXiv preprint arXiv:1605.00459 (2016)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12266"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.494"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_8_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.4249\/scholarpedia.1883"},{"key":"e_1_3_2_1_10_1","volume-title":"International conference on machine learning. PMLR, 8748\u20138763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_2_1_11_1","volume-title":"Unsupervised representation learning with deep convolutional generative adversarial networks. arXiv preprint arXiv:1511.06434","author":"Radford Alec","year":"2015","unstructured":"Alec Radford, Luke Metz, and Soumith Chintala. 2015. Unsupervised representation learning with deep convolutional generative adversarial networks. arXiv preprint arXiv:1511.06434 (2015)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.131"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.328"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00060"}],"event":{"name":"ICBDT 2023: 2023 6th International Conference on Big Data Technologies","location":"Qingdao China","acronym":"ICBDT 2023"},"container-title":["Proceedings of the 2023 6th International Conference on Big Data Technologies"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627377.3627442","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3627377.3627442","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T16:17:28Z","timestamp":1755879448000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627377.3627442"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,9,22]]},"references-count":16,"alternative-id":["10.1145\/3627377.3627442","10.1145\/3627377"],"URL":"https:\/\/doi.org\/10.1145\/3627377.3627442","relation":{},"subject":[],"published":{"date-parts":[[2023,9,22]]},"assertion":[{"value":"2023-12-04","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}