{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T09:11:11Z","timestamp":1768295471816,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62376266"],"award-info":[{"award-number":["62376266"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022YFB3103800"],"award-info":[{"award-number":["2022YFB3103800"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"name":"the fund of Laboratory for Advanced Computing and Intelligence Engineering"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680877","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"2525-2534","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Focus, Distinguish, and Prompt: Unleashing CLIP for Efficient and Flexible Scene Text Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2696-8549","authenticated-orcid":false,"given":"Gangyan","family":"Zeng","sequence":"first","affiliation":[{"name":"School of Cyber Science and Engineering, Nanjing University of Science and Technology, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3783-7974","authenticated-orcid":false,"given":"Yuan","family":"Zhang","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Media Convergence and Communication, Communication University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2016-0901","authenticated-orcid":false,"given":"Jin","family":"Wei","sequence":"additional","affiliation":[{"name":"Lenovo Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8628-411X","authenticated-orcid":false,"given":"Dongbao","family":"Yang","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9518-5914","authenticated-orcid":false,"given":"Peng","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Cyber Science and Engineering, Nanjing University of Science and Technology &amp; Laboratory for Advanced Computing and Intelligence Engineering, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5446-2014","authenticated-orcid":false,"given":"Yiwen","family":"Gao","sequence":"additional","affiliation":[{"name":"School of Cyber Science and Engineering, Nanjing University of Science and Technology, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3130-3220","authenticated-orcid":false,"given":"Xugong","family":"Qin","sequence":"additional","affiliation":[{"name":"School of Cyber Science and Engineering, Nanjing University of Science and Technology, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4188-9953","authenticated-orcid":false,"given":"Yu","family":"Zhou","sequence":"additional","affiliation":[{"name":"TMCC, College of Computer Science, Nankai University, Tianjin, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"crossref","unstructured":"David Aldavert Marcal Rusinol Ricardo Toledo and Josep Llad\u00f3s. 2013. Integrating visual and textual cues for query-by-string word spotting. In ICDAR. 511--515.","DOI":"10.1109\/ICDAR.2013.108"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2014.2339814"},{"key":"e_1_3_2_2_3_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_2_4_1","volume-title":"Less is more: Removing text-regions improves clip training efficiency and robustness. arXiv preprint arXiv:2305.05095","author":"Cao Liangliang","year":"2023","unstructured":"Liangliang Cao, Bowen Zhang, Chen Chen, Yinfei Yang, Xianzhi Du, Wencong Zhang, Zhiyun Lu, and Yantao Zheng. 2023. Less is more: Removing text-regions improves clip training efficiency and robustness. arXiv preprint arXiv:2305.05095 (2023)."},{"key":"e_1_3_2_2_5_1","first-page":"935","article-title":"Total-text: A comprehensive dataset for scene text detection and recognition","volume":"1","author":"Ch'ng Chee Kheng","year":"2017","unstructured":"Chee Kheng Ch'ng and Chee Seng Chan. 2017. Total-text: A comprehensive dataset for scene text detection and recognition. In ICDAR, Vol. 1. 935--942.","journal-title":"ICDAR"},{"key":"e_1_3_2_2_6_1","volume-title":"SVTR: Scene Text Recognition with a Single Visual Model. In IJCAI. 884--890.","author":"Du Yongkun","year":"2022","unstructured":"Yongkun Du, Zhineng Chen, Caiyan Jia, Xiaoting Yin, Tianlun Zheng, Chenxia Li, Yuning Du, and Yu-Gang Jiang. 2022. SVTR: Scene Text Recognition with a Single Visual Model. In IJCAI. 884--890."},{"key":"e_1_3_2_2_7_1","volume-title":"UATVR: Uncertainty-adaptive text-video retrieval. In ICCV. 13723--13733.","author":"Fang Bo","year":"2023","unstructured":"Bo Fang, Wenhao Wu, Chang Liu, Yu Zhou, Yuxin Song, Weiping Wang, Xiangbo Shu, Xiangyang Ji, and Jingdong Wang. 2023. UATVR: Uncertainty-adaptive text-video retrieval. In ICCV. 13723--13733."},{"key":"e_1_3_2_2_8_1","volume-title":"DocPedia: Unleashing the Power of Large Multimodal Model in the Frequency Domain for Versatile Document Understanding. arXiv preprint arXiv:2311.11810","author":"Feng Hao","year":"2023","unstructured":"Hao Feng, Qi Liu, Hao Liu, Wengang Zhou, Houqiang Li, and Can Huang. 2023. DocPedia: Unleashing the Power of Large Multimodal Model in the Frequency Domain for Versatile Document Understanding. arXiv preprint arXiv:2311.11810 (2023)."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"crossref","unstructured":"Suman K Ghosh Lluis Gomez Dimosthenis Karatzas and Ernest Valveny. 2015. Efficient indexing for query by string text retrieval. In ICDAR. 1236--1240.","DOI":"10.1109\/ICDAR.2015.7333961"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"crossref","unstructured":"Suman K Ghosh and Ernest Valveny. 2015. Query by string word spotting based on character bi-gram indexing. In ICDAR. 881--885.","DOI":"10.1109\/ICDAR.2015.7333888"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"crossref","unstructured":"Llu\u00eds G\u00f3mez Andr\u00e9s Mafla Marccal Rusinol and Dimosthenis Karatzas. 2018. Single shot scene text retrieval. In ECCV. 700--715.","DOI":"10.1007\/978-3-030-01264-9_43"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"crossref","unstructured":"Tong He Zhi Tian Weilin Huang Chunhua Shen Yu Qiao and Changming Sun. 2018. An end-to-end textspotter with explicit alignment and attention. In CVPR. 5020--5029.","DOI":"10.1109\/CVPR.2018.00527"},{"key":"e_1_3_2_2_13_1","unstructured":"Anwen Hu Haiyang Xu Jiabo Ye Ming Yan Liang Zhang Bo Zhang Chen Li Ji Zhang Qin Jin Fei Huang et al. 2024. mPLUG-DocOwl 1.5: Unified Structure Learning for OCR-free Document Understanding. arXiv preprint arXiv:2403.12895 (2024)."},{"key":"e_1_3_2_2_14_1","volume-title":"Swintextspotter: Scene text spotting via better synergy between text detection and text recognition. In CVPR. 4593--4603.","author":"Huang Mingxin","year":"2022","unstructured":"Mingxin Huang, Yuliang Liu, Zhenghao Peng, Chongyu Liu, Dahua Lin, Shenggao Zhu, Nicholas Yuan, Kai Ding, and Lianwen Jin. 2022. Swintextspotter: Scene text spotting via better synergy between text detection and text recognition. In CVPR. 4593--4603."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0823-z"},{"key":"e_1_3_2_2_16_1","volume-title":"Adam: A method for stochastic optimization. In ICLR. 4190--4198.","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. In ICLR. 4190--4198."},{"key":"e_1_3_2_2_17_1","volume-title":"Monkey: Image resolution and text label are important things for large multi-modal models. In CVPR. 26763--26773.","author":"Li Zhang","year":"2024","unstructured":"Zhang Li, Biao Yang, Qiang Liu, Zhiyin Ma, Shuo Zhang, Jingxu Yang, Yabo Sun, Yuliang Liu, and Xiang Bai. 2024. Monkey: Image resolution and text label are important things for large multi-modal models. In CVPR. 26763--26773."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"crossref","unstructured":"Feng Liang Bichen Wu Xiaoliang Dai Kunpeng Li Yinan Zhao Hang Zhang Peizhao Zhang Peter Vajda and Diana Marculescu. 2023. Open-vocabulary semantic segmentation with mask-adapted clip. In CVPR. 7061--7070.","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"crossref","unstructured":"Minghui Liao Guan Pang Jing Huang Tal Hassner and Xiang Bai. 2020. Mask textspotter v3: Segmentation proposal network for robust scene text spotting. In ECCV. 706--722.","DOI":"10.1007\/978-3-030-58621-8_41"},{"key":"e_1_3_2_2_20_1","volume-title":"Bin Wang, Weijia Li, and Mike Zheng Shou.","author":"Lin Yiqi","year":"2023","unstructured":"Yiqi Lin, Conghui He, Alex Jinpeng Wang, Bin Wang, Weijia Li, and Mike Zheng Shou. 2023. Parrot Captions Teach CLIP to Spot Text. arXiv preprint arXiv:2312.14232 (2023)."},{"key":"e_1_3_2_2_21_1","volume-title":"Xiaogang Wang, Jifeng Dai, Yu Qiao, and Hongsheng Li.","author":"Lin Ziyi","year":"2022","unstructured":"Ziyi Lin, Shijie Geng, Renrui Zhang, Peng Gao, Gerard De Melo, Xiaogang Wang, Jifeng Dai, Yu Qiao, and Hongsheng Li. 2022. Frozen clip models are efficient video learners. In ECCV. 388--404."},{"key":"e_1_3_2_2_22_1","volume-title":"prompt, and predict: A systematic survey of prompting methods in natural language processing. arXiv preprint arXiv:2107.13586","author":"Liu Pengfei","year":"2021","unstructured":"Pengfei Liu, Weizhe Yuan, Jinlan Fu, Zhengbao Jiang, Hiroaki Hayashi, and Graham Neubig. 2021. Pre-train, prompt, and predict: A systematic survey of prompting methods in natural language processing. arXiv preprint arXiv:2107.13586 (2021)."},{"key":"e_1_3_2_2_23_1","unstructured":"Yuliang Liu Hao Chen Chunhua Shen Tong He Lianwen Jin and Liangwei Wang. 2020. ABCNet: Real-time scene text spotting with adaptive bezier-curve network. In CVPR. 9809--9818."},{"key":"e_1_3_2_2_24_1","unstructured":"Yuliang Liu Zhang Li Hongliang Li Wenwen Yu Mingxin Huang Dezhi Peng Mingyu Liu Mingrui Chen Chunyuan Li Lianwen Jin et al. 2023. On the hidden mystery of ocr in large multimodal models. arXiv preprint arXiv:2305.07895 (2023)."},{"key":"e_1_3_2_2_25_1","unstructured":"Tengchao Lv Yupan Huang Jingye Chen Lei Cui Shuming Ma Yaoyao Chang Shaohan Huang Wenhui Wang Li Dong Weiyao Luo et al. 2023. Kosmos-2.5: A multimodal literate model. arXiv preprint arXiv:2309.11419 (2023)."},{"key":"e_1_3_2_2_26_1","first-page":"107656","article-title":"Real-time lexicon-free scene text retrieval","volume":"110","author":"Mafla Andr\u00e9s","year":"2021","unstructured":"Andr\u00e9s Mafla, Ruben Tito, Sounak Dey, Llu\u00eds G\u00f3mez, Marccal Rusi nol, Ernest Valveny, and Dimosthenis Karatzas. 2021. Real-time lexicon-free scene text retrieval. PR, Vol. 110 (2021), 107656.","journal-title":"PR"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"crossref","unstructured":"Joanna Materzy'nska Antonio Torralba and David Bau. 2022. Disentangling visual and written concepts in clip. In CVPR. 16410--16419.","DOI":"10.1109\/CVPR52688.2022.01592"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"crossref","unstructured":"Anand Mishra Karteek Alahari and CV Jawahar. 2013. Image retrieval using textual cues. In ICCV. 3040--3047.","DOI":"10.1109\/ICCV.2013.378"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00254"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"crossref","unstructured":"Zhi Qiao Yu Zhou Jin Wei Wei Wang Yuan Zhang Ning Jiang Hongbin Wang and Weiping Wang. 2021. PIMNet: a parallel iterative and mimicking network for scene text recognition. In ACM MM. 2046--2055.","DOI":"10.1145\/3474085.3475238"},{"key":"e_1_3_2_2_31_1","volume-title":"SEED: Semantics enhanced encoder-decoder framework for scene text recognition. In CVPR. 13528--13537.","author":"Qiao Zhi","year":"2020","unstructured":"Zhi Qiao, Yu Zhou, Dongbao Yang, Yucan Zhou, and Weiping Wang. 2020. SEED: Semantics enhanced encoder-decoder framework for scene text recognition. In CVPR. 13528--13537."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"crossref","unstructured":"Xugong Qin Pengyuan Lyu Chengquan Zhang Yu Zhou Kun Yao Peng Zhang Hailun Lin and Weiping Wang. 2023. Towards robust real-time scene text detection: From semantic to instance representation learning. In ACM MM. 2025--2034.","DOI":"10.1145\/3581783.3611801"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"crossref","unstructured":"Xugong Qin Yu Zhou Youhui Guo Dayan Wu Zhihong Tian Ning Jiang Hongbin Wang and Weiping Wang. 2021. Mask is all you need: Rethinking mask r-cnn for dense and arbitrary-shaped scene text detection. In ACM MM. 414--423.","DOI":"10.1145\/3474085.3475178"},{"key":"e_1_3_2_2_34_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In ICML. 8748--8763."},{"key":"e_1_3_2_2_35_1","volume-title":"Denseclip: Language-guided dense prediction with context-aware prompting. In CVPR. 18082--18091.","author":"Rao Yongming","year":"2022","unstructured":"Yongming Rao, Wenliang Zhao, Guangyi Chen, Yansong Tang, Zheng Zhu, Guan Huang, Jie Zhou, and Jiwen Lu. 2022. Denseclip: Language-guided dense prediction with context-aware prompting. In CVPR. 18082--18091."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3018491"},{"key":"e_1_3_2_2_37_1","volume-title":"Logoprompt: Synthetic text images can be good visual prompts for vision-language models. In ICCV. 2932--2941.","author":"Shi Cheng","year":"2023","unstructured":"Cheng Shi and Sibei Yang. 2023. Logoprompt: Synthetic text images can be good visual prompts for vision-language models. In ICCV. 2932--2941."},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"crossref","unstructured":"Yan Shu Wei Wang Yu Zhou Shaohui Liu Aoting Zhang Dongbao Yang and Weipinng Wang. 2023. Perceiving ambiguity and semantics without recognition: an efficient and effective ambiguous scene text detector. In ACM MM. 1851--1862.","DOI":"10.1145\/3581783.3612383"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"crossref","unstructured":"Amanpreet Singh Guan Pang Mandy Toh Jing Huang Wojciech Galuba and Tal Hassner. 2021. TextOCR: Towards large-scale end-to-end reasoning for arbitrary-shaped scene text. In CVPR. 8802--8812.","DOI":"10.1109\/CVPR46437.2021.00869"},{"key":"e_1_3_2_2_40_1","volume-title":"Phocnet: A deep convolutional neural network for word spotting in handwritten documents. In ICFHR. 277--282.","author":"Sudholt Sebastian","year":"2016","unstructured":"Sebastian Sudholt and Gernot A Fink. 2016. Phocnet: A deep convolutional neural network for word spotting in handwritten documents. In ICFHR. 277--282."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"crossref","unstructured":"Hao Wang Xiang Bai Mingkun Yang Shenggao Zhu Jing Wang and Wenyu Liu. 2021. Scene text retrieval via joint text detection and similarity learning. In CVPR. 4558--4567.","DOI":"10.1109\/CVPR46437.2021.00453"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"crossref","unstructured":"Kai Wang Boris Babenko and Serge Belongie. 2011. End-to-end scene text recognition. In ICCV. 1457--1464.","DOI":"10.1109\/ICCV.2011.6126402"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16383"},{"key":"e_1_3_2_2_44_1","volume-title":"Tpsnet: Reverse thinking of thin plate splines for arbitrary shape scene text representation. In ACM MM. 5014--5025.","author":"Wang Wei","year":"2022","unstructured":"Wei Wang, Yu Zhou, Jiahao Lv, Dayan Wu, Guoqing Zhao, Ning Jiang, and Weipinng Wang. 2022. Tpsnet: Reverse thinking of thin plate splines for arbitrary shape scene text representation. In ACM MM. 5014--5025."},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"crossref","unstructured":"Zixiao Wang Hongtao Xie Yuxin Wang Jianjun Xu Boqiang Zhang and Yongdong Zhang. 2023. Symmetrical Linguistic Feature Distillation with CLIP for Scene Text Recognition. In ACM MM. 509--518.","DOI":"10.1145\/3581783.3611769"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548051"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"crossref","unstructured":"Lilong Wen Yingrong Wang Dongxiang Zhang and Gang Chen. 2023. Visual Matching is Enough for Scene Text Retrieval. In WSDM. 447--455.","DOI":"10.1145\/3539597.3570428"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"crossref","unstructured":"Xiao Yang Dafang He Wenyi Huang Alexander Ororbia Zihan Zhou Daniel Kifer and C Lee Giles. 2017. Smart library: Identifying books on library shelves using supervised deep learning for scene text reading. In JCDL. 1--4.","DOI":"10.1109\/JCDL.2017.7991581"},{"key":"e_1_3_2_2_49_1","volume-title":"Ureader: Universal ocr-free visually-situated language understanding with multimodal large language model. arXiv preprint arXiv:2310.05126","author":"Ye Jiabo","year":"2023","unstructured":"Jiabo Ye, Anwen Hu, Haiyang Xu, Qinghao Ye, Ming Yan, Guohai Xu, Chenliang Li, Junfeng Tian, Qi Qian, Ji Zhang, et al. 2023. Ureader: Universal ocr-free visually-situated language understanding with multimodal large language model. arXiv preprint arXiv:2310.05126 (2023)."},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"crossref","unstructured":"Xu-Cheng Yin Xuwang Yin Kaizhu Huang and Hong-Wei Hao. 2013. Accurate and robust text detection: A step-in for text retrieval in natural scene images. In SIGIR. 1091--1092.","DOI":"10.1145\/2484028.2484197"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"crossref","unstructured":"Wenwen Yu Yuliang Liu Wei Hua Deqiang Jiang Bo Ren and Xiang Bai. 2023. Turning a CLIP Model into a Scene Text Detector. In CVPR. 6978--6988.","DOI":"10.1109\/CVPR52729.2023.00674"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"crossref","unstructured":"Gangyan Zeng Yuan Zhang Yu Zhou Bo Fang Guoqing Zhao Xin Wei and Weiping Wang. 2023. Filling in the blank: Rationale-augmented prompt tuning for TextVQA. In ACM MM. 1261--1272.","DOI":"10.1145\/3581783.3612520"},{"key":"e_1_3_2_2_53_1","first-page":"109337","article-title":"Beyond OCR VQA: Towards end-to-end reading and reasoning for robust and accurate textvqa","volume":"138","author":"Zeng Gangyan","year":"2023","unstructured":"Gangyan Zeng, Yuan Zhang, Yu Zhou, Xiaomeng Yang, Ning Jiang, Guoqing Zhao, Weiping Wang, and Xu-Cheng Yin. 2023. Beyond OCR VQA: Towards end-to-end reading and reasoning for robust and accurate textvqa. PR, Vol. 138 (2023), 109337.","journal-title":"PR"},{"key":"e_1_3_2_2_54_1","volume-title":"Exploring Perceptual Limitation of Multimodal Large Language Models. arXiv preprint arXiv:2402.07384","author":"Zhang Jiarui","year":"2024","unstructured":"Jiarui Zhang, Jinyi Hu, Mahyar Khayatkhoei, Filip Ilievski, and Maosong Sun. 2024. Exploring Perceptual Limitation of Multimodal Large Language Models. arXiv preprint arXiv:2402.07384 (2024)."},{"key":"e_1_3_2_2_55_1","volume-title":"CLIP4STR: A Simple Baseline for Scene Text Recognition with Pre-trained Vision-Language Model. arXiv preprint arXiv:2305.14014","author":"Zhao Shuai","year":"2023","unstructured":"Shuai Zhao, Xiaohan Wang, Linchao Zhu, and Yi Yang. 2023. CLIP4STR: A Simple Baseline for Scene Text Recognition with Pre-trained Vision-Language Model. arXiv preprint arXiv:2305.14014 (2023)."},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01880-0"},{"key":"e_1_3_2_2_57_1","volume-title":"Chen Change Loy, and Bo Dai","author":"Zhou Chong","year":"2022","unstructured":"Chong Zhou, Chen Change Loy, and Bo Dai. 2022. Extract free dense labels from clip. In ECCV. 696--712."},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680877","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680877","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:08Z","timestamp":1750295888000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680877"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":58,"alternative-id":["10.1145\/3664647.3680877","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680877","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}