{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:07:45Z","timestamp":1765357665611,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,8,24]],"date-time":"2024-08-24T00:00:00Z","timestamp":1724457600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,8,25]]},"DOI":"10.1145\/3637528.3671633","type":"proceedings-article","created":{"date-parts":[[2024,8,25]],"date-time":"2024-08-25T04:55:12Z","timestamp":1724561712000},"page":"5690-5700","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["L\n            <scp>umos<\/scp>\n            : Empowering Multimodal LLMs with Scene Text Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1401-262X","authenticated-orcid":false,"given":"Ashish","family":"Shenoy","sequence":"first","affiliation":[{"name":"Meta Reality Labs, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0366-2327","authenticated-orcid":false,"given":"Yichao","family":"Lu","sequence":"additional","affiliation":[{"name":"Meta Reality Labs, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9283-6141","authenticated-orcid":false,"given":"Srihari","family":"Jayakumar","sequence":"additional","affiliation":[{"name":"Meta Reality Labs, Menlo Park, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2946-3151","authenticated-orcid":false,"given":"Debojeet","family":"Chatterjee","sequence":"additional","affiliation":[{"name":"Meta Reality Labs, Menlo Park, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5731-9146","authenticated-orcid":false,"given":"Mohsen","family":"Moslehpour","sequence":"additional","affiliation":[{"name":"Meta Reality Labs, Menlo Park, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5850-7048","authenticated-orcid":false,"given":"Pierce","family":"Chuang","sequence":"additional","affiliation":[{"name":"Meta Reality Labs, Menlo Park, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9503-4881","authenticated-orcid":false,"given":"Abhay","family":"Harpale","sequence":"additional","affiliation":[{"name":"Meta Reality Labs, Menlo Park, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7791-2784","authenticated-orcid":false,"given":"Vikas","family":"Bhardwaj","sequence":"additional","affiliation":[{"name":"Meta Reality Labs, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7139-3072","authenticated-orcid":false,"given":"Di","family":"Xu","sequence":"additional","affiliation":[{"name":"Reality Labs, Meta, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4129-8913","authenticated-orcid":false,"given":"Shicong","family":"Zhao","sequence":"additional","affiliation":[{"name":"Meta Reality Labs, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1389-1810","authenticated-orcid":false,"given":"Longfang","family":"Zhao","sequence":"additional","affiliation":[{"name":"Meta Reality Labs, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2843-2505","authenticated-orcid":false,"given":"Ankit","family":"Ramchandani","sequence":"additional","affiliation":[{"name":"Meta, Menlo Park, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2049-2458","authenticated-orcid":false,"given":"Xin Luna","family":"Dong","sequence":"additional","affiliation":[{"name":"Meta Reality Labs, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7820-9945","authenticated-orcid":false,"given":"Anuj","family":"Kumar","sequence":"additional","affiliation":[{"name":"Meta Reality Labs, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,8,24]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"[n. d.]. AWS Rekognition. https:\/\/aws.amazon.com\/rekognition\/"},{"key":"e_1_3_2_2_2_1","unstructured":"[n. d.]. Google Cloud OCR. https:\/\/cloud.google.com\/vision\/docs\/ocr"},{"key":"e_1_3_2_2_3_1","unstructured":"OpenAI (2023). 2023. GPT-4 Technical Report. arXiv:2303.08774 [cs.CL]"},{"key":"e_1_3_2_2_4_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katie Millican Malcolm Reynolds Roman Ring Eliza Rutherford Serkan Cabi Tengda Han Zhitao Gong Sina Samangooei Marianne Monteiro Jacob Menick Sebastian Borgeaud Andrew Brock Aida Nematzadeh Sahand Sharifzadeh Mikolaj Binkowski Ricardo Barreira Oriol Vinyals Andrew Zisserman and Karen Simonyan. 2022. Flamingo: a Visual Language Model for Few-Shot Learning. arXiv:2204.14198 [cs.CV]"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_2_6_1","unstructured":"Anas Awadalla Irena Gao Josh Gardner Jack Hessel Yusuf Hanafy Wanrong Zhu Kalyani Marathe Yonatan Bitton Samir Gadre Shiori Sagawa Jenia Jitsev Simon Kornblith PangWei Koh Gabriel Ilharco Mitchell Wortsman and Ludwig Schmidt. 2023. OpenFlamingo: An Open-Source Framework for Training Large Autoregressive Vision-Language Models. arXiv:2308.01390 [cs.CV]"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688109"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3219861"},{"key":"e_1_3_2_2_9_1","volume-title":"Lin (Eds.)","volume":"33","author":"Cubuk Ekin Dogus","year":"2020","unstructured":"Ekin Dogus Cubuk, Barret Zoph, Jon Shlens, and Quoc Le. 2020. RandAugment: Practical Automated Data Augmentation with a Reduced Search Space. In Advances in Neural Information Processing Systems, H. Larochelle, M. Ranzato, R. Hadsell, M.F. Balcan, and H. Lin (Eds.), Vol. 33. Curran Associates, Inc., 18613--18624."},{"key":"e_1_3_2_2_10_1","volume-title":"Ravi Teja Gadde, and Katrin Kirchhoff","author":"Dingliwal Saket","year":"2021","unstructured":"Saket Dingliwal, Ashish Shenoy, Sravan Bodapati, Ankur Gandhe, Ravi Teja Gadde, and Katrin Kirchhoff. 2021. Efficient domain adaptation of language models in ASR systems using Prompt-tuning. CoRR abs\/2110.06502 (2021). arXiv:2110.06502 https:\/\/arxiv.org\/abs\/2110.06502"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-824"},{"key":"e_1_3_2_2_12_1","unstructured":"Yuning Du Chenxia Li Ruoyu Guo Xiaoting Yin Weiwei Liu Jun Zhou Yifan Bai Zilin Yu Yehua Yang Qingqing Dang and Haoshuang Wang. 2020. PP-OCR: A Practical Ultra Lightweight OCR System. arXiv:2009.09941 [cs.CV]"},{"key":"e_1_3_2_2_13_1","unstructured":"Hao Feng Zijian Wang Jingqun Tang Jinghui Lu Wengang Zhou Houqiang Li and Can Huang. 2023. UniDoc: A Universal Large Multimodal Model for Simultaneous Text Detection Recognition Spotting and Understanding. arXiv:2308.11592 [cs.AI]"},{"key":"e_1_3_2_2_14_1","unstructured":"Kaiming He Georgia Gkioxari Piotr Doll\u00e1r and Ross Girshick. 2018. Mask R-CNN. arXiv:1703.06870 [cs.CV]"},{"key":"e_1_3_2_2_15_1","volume-title":"BLIVA: A Simple Multimodal LLM for Better Handling of Text-Rich Visual Questions. arXiv:2308.09936 [cs.CV]","author":"Hu Wenbo","year":"2023","unstructured":"Wenbo Hu, Yifan Xu, Yi Li, Weiyue Li, Zeyuan Chen, and Zhuowen Tu. 2023. BLIVA: A Simple Multimodal LLM for Better Handling of Text-Rich Visual Questions. arXiv:2308.09936 [cs.CV]"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00244"},{"key":"e_1_3_2_2_17_1","volume-title":"DISGO: Automatic End-to-End Evaluation for Scene Text OCR. arXiv:2308.13173 [cs.CV]","author":"Hwang Mei-Yuh","year":"2023","unstructured":"Mei-Yuh Hwang, Yangyang Shi, Ankit Ramchandani, Guan Pang, Praveen Krishnan, Lucas Kabela, Frank Seide, Samyak Datta, and Jun Liu. 2023. DISGO: Automatic End-to-End Evaluation for Scene Text OCR. arXiv:2308.13173 [cs.CV]"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0823-z"},{"key":"e_1_3_2_2_19_1","unstructured":"Minghui Liao Pengyuan Lyu Minghang He Cong Yao Wenhao Wu and Xiang Bai. 2019. Mask TextSpotter: An End-to-End Trainable Neural Network for Spotting Text with Arbitrary Shapes. arXiv:1908.08207 [cs.CV]"},{"key":"e_1_3_2_2_20_1","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong Jae Lee. 2023. Visual Instruction Tuning. arXiv:2304.08485 [cs.CV]"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"crossref","unstructured":"Shu Liu Lu Qi Haifang Qin Jianping Shi and Jiaya Jia. 2018. Path Aggregation Network for Instance Segmentation. arXiv:1803.01534 [cs.CV]","DOI":"10.1109\/CVPR.2018.00913"},{"key":"e_1_3_2_2_22_1","volume-title":"Lianwen Jin, and Xiang Bai.","author":"Liu Yuliang","year":"2023","unstructured":"Yuliang Liu, Zhang Li, Hongliang Li, Wenwen Yu, Yang Liu, Biao Yang, Mingxin Huang, Dezhi Peng, Mingyu Liu, Mingrui Chen, Chunyuan Li, Xucheng Yin, Cheng lin Liu, Lianwen Jin, and Xiang Bai. 2023. On the Hidden Mystery of OCR in Large Multimodal Models. arXiv:2305.07895 [cs.CV]"},{"key":"e_1_3_2_2_23_1","unstructured":"Shangbang Long Siyang Qin Dmitry Panteleev Alessandro Bissacco Yasuhisa Fujii and Michalis Raptis. 2022. Towards End-to-End Unified Scene Text Detection and Layout Analysis. arXiv:2203.15143 [cs.CV]"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2018.2818020"},{"key":"e_1_3_2_2_26_1","volume-title":"DocVQA: A Dataset for VQA on Document Images. In 2021 IEEE Winter Conference on Applications of Computer Vision (WACV). 2199--2208","author":"Mathew Minesh","year":"2021","unstructured":"Minesh Mathew, Dimosthenis Karatzas, and C. V. Jawahar. 2021. DocVQA: A Dataset for VQA on Document Images. In 2021 IEEE Winter Conference on Applications of Computer Vision (WACV). 2199--2208. https:\/\/doi.org\/10.1109\/ WACV48630.2021.00225"},{"key":"e_1_3_2_2_27_1","volume-title":"STRIDE: Scene Text Recognition In-Device. In 2021 International Joint Conference on Neural Networks (IJCNN). 1--8. https:\/\/doi.org\/10","author":"Munjal Rachit S","year":"2021","unstructured":"Rachit S Munjal, Arun D Prabhu, Nikhil Arora, Sukumar Moharana, and Gopi Ramena. 2021. STRIDE: Scene Text Recognition In-Device. In 2021 International Joint Conference on Neural Networks (IJCNN). 1--8. https:\/\/doi.org\/10.1109\/ IJCNN52387.2021.9534319"},{"key":"e_1_3_2_2_28_1","volume-title":"Yelysei Bondarenko, Mart van Baalen, and Tijmen Blankevoort.","author":"Nagel Markus","year":"2021","unstructured":"Markus Nagel, Marios Fournarakis, Rana Ali Amjad, Yelysei Bondarenko, Mart van Baalen, and Tijmen Blankevoort. 2021. A White Paper on Neural Network Quantization. arXiv:2106.08295 [cs.LG]"},{"key":"e_1_3_2_2_29_1","unstructured":"Shaoqing Ren Kaiming He Ross Girshick and Jian Sun. 2016. Faster RCNN: Towards Real-Time Object Detection with Region Proposal Networks. arXiv:1506.01497 [cs.CV]"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1849"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2646371"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.452"},{"key":"e_1_3_2_2_34_1","unstructured":"Yongxin Shi Dezhi Peng Wenhui Liao Zening Lin Xinhong Chen Chongyu Liu Yuyi Zhang and Lianwen Jin. 2023. Exploring OCR Capabilities of GPT-4V(ision): A Quantitative and In-depth Evaluation. arXiv:2310.16809 [cs.CV]"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"crossref","unstructured":"Amanpreet Singh Guan Pang Mandy Toh Jing Huang Wojciech Galuba and Tal Hassner. 2021. TextOCR: Towards large-scale end-to-end reasoning for arbitraryshaped scene text. arXiv:2105.05486 [cs.CV]","DOI":"10.1109\/CVPR46437.2021.00869"},{"key":"e_1_3_2_2_36_1","volume-title":"Gemini: A Family of Highly Capable Multimodal Models. arXiv:2312.11805 [cs.CL]","author":"Team Gemini","year":"2023","unstructured":"Gemini Team, Rohan Anil, Sebastian Borgeaud, Yonghui Wu, Jean-Baptiste Alayrac, Jiahui Yu, Radu Soricut, Johan Schalkwyk, Andrew M. Dai, and et al. 2023. Gemini: A Family of Highly Capable Multimodal Models. arXiv:2312.11805 [cs.CL]"},{"key":"e_1_3_2_2_37_1","volume-title":"FCOS: Fully Convolutional One-Stage Object Detection. arXiv:1904.01355 [cs.CV]","author":"Tian Zhi","year":"2019","unstructured":"Zhi Tian, Chunhua Shen, Hao Chen, and Tong He. 2019. FCOS: Fully Convolutional One-Stage Object Detection. arXiv:1904.01355 [cs.CV]"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01298"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"crossref","unstructured":"Huiyu Wang Yukun Zhu Hartwig Adam Alan Yuille and Liang-Chieh Chen. 2021. MaX-DeepLab: End-to-End Panoptic Segmentation with Mask Transformers. arXiv:2012.00759 [cs.CV]","DOI":"10.1109\/CVPR46437.2021.00542"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.5555\/1886063.1886108"},{"key":"e_1_3_2_2_41_1","unstructured":"Xue Yang Xiaojiang Yang Jirui Yang Qi Ming Wentao Wang Qi Tian and Junchi Yan. 2022. Learning High-Precision Bounding Box for Rotated Object Detection via Kullback-Leibler Divergence. arXiv:2106.01883 [cs.CV]"},{"key":"e_1_3_2_2_42_1","volume-title":"Xin Alex Lin, and Fei Huang","author":"Ye Jiabo","year":"2023","unstructured":"Jiabo Ye, Anwen Hu, Haiyang Xu, Qinghao Ye, Ming Yan, Guohai Xu, Chenliang Li, Junfeng Tian, Qi Qian, Ji Zhang, Qin Jin, Liang He, Xin Alex Lin, and Fei Huang. 2023. UReader: Universal OCR-free Visually-situated Language Understanding with Multimodal Large Language Model. arXiv:2310.05126 [cs.CV]"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"crossref","unstructured":"Haoyang Zhang Ying Wang Feras Dayoub and Niko S\u00fcnderhauf. 2021. VarifocalNet: An IoU-aware Dense Object Detector. arXiv:2008.13367 [cs.CV]","DOI":"10.1109\/CVPR46437.2021.00841"},{"key":"e_1_3_2_2_44_1","unstructured":"Deyao Zhu Jun Chen Xiaoqian Shen Xiang Li and Mohamed Elhoseiny. 2023. MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. arXiv:2304.10592 [cs.CV]"}],"event":{"name":"KDD '24: The 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"],"location":"Barcelona Spain","acronym":"KDD '24"},"container-title":["Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3637528.3671633","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3637528.3671633","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:05:59Z","timestamp":1750291559000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3637528.3671633"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,24]]},"references-count":44,"alternative-id":["10.1145\/3637528.3671633","10.1145\/3637528"],"URL":"https:\/\/doi.org\/10.1145\/3637528.3671633","relation":{},"subject":[],"published":{"date-parts":[[2024,8,24]]},"assertion":[{"value":"2024-08-24","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}