{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T02:23:16Z","timestamp":1775787796116,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":34,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,31]],"date-time":"2025-03-31T00:00:00Z","timestamp":1743379200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,31]]},"DOI":"10.1145\/3712676.3719267","type":"proceedings-article","created":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T13:13:23Z","timestamp":1742994803000},"page":"335-341","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Efficient and Accurate Scene Text Recognition with Cascaded-Transformers"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-0450-0645","authenticated-orcid":false,"given":"Savas","family":"Ozkan","sequence":"first","affiliation":[{"name":"Samsung Research UK, Surrey, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7189-7260","authenticated-orcid":false,"given":"Andrea","family":"Maracani","sequence":"additional","affiliation":[{"name":"Samsung Research UK, Surrey, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6217-8731","authenticated-orcid":false,"given":"Mete","family":"Ozay","sequence":"additional","affiliation":[{"name":"Samsung Research UK, Surrey, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9599-1095","authenticated-orcid":false,"given":"Hyowon","family":"Kim","sequence":"additional","affiliation":[{"name":"Samsung Electronics, Suwon, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4722-7419","authenticated-orcid":false,"given":"Sijun","family":"Cho","sequence":"additional","affiliation":[{"name":"Samsung Electronics, Suwon, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2049-4222","authenticated-orcid":false,"given":"Eunchung","family":"Noh","sequence":"additional","affiliation":[{"name":"Samsung Electronics, Suwon, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8412-5545","authenticated-orcid":false,"given":"Jeongwon","family":"Min","sequence":"additional","affiliation":[{"name":"Samsung Electronics, Suwon, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1884-6675","authenticated-orcid":false,"given":"Jung Min","family":"Cho","sequence":"additional","affiliation":[{"name":"Samsung Electronics, Suwon, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,3,31]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv","author":"Alexey Dosovitskiy","year":"2010","unstructured":"Dosovitskiy Alexey. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv: 2010.11929 (2020)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-86549-8_21"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_11"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00252"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_6_1","first-page":"1","article-title":"Neural architecture search: A survey","volume":"20","author":"Elsken Thomas","year":"2019","unstructured":"Thomas Elsken, Jan Hendrik Metzen, and Frank Hutter. 2019. Neural architecture search: A survey. Journal of Machine Learning Research 20, 55 (2019), 1--21.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.254"},{"key":"e_1_3_2_1_8_1","volume-title":"A survey on optical character recognition system. arXiv preprint arXiv:1710.05703","author":"Islam Noman","year":"2017","unstructured":"Noman Islam, Zeeshan Islam, and Nazia Noor. 2017. A survey on optical character recognition system. arXiv preprint arXiv:1710.05703 (2017)."},{"key":"e_1_3_2_1_9_1","volume-title":"Synthetic data and artificial neural networks for natural scene text recognition. arXiv preprint arXiv:1406.2227","author":"Jaderberg Max","year":"2014","unstructured":"Max Jaderberg, Karen Simonyan, Andrea Vedaldi, and Andrew Zisserman. 2014. Synthetic data and artificial neural networks for natural scene text recognition. arXiv preprint arXiv:1406.2227 (2014)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01878"},{"key":"e_1_3_2_1_11_1","volume-title":"ICDAR 2015 competition on robust reading. In 2015 13th international conference on document analysis and recognition (ICDAR). IEEE, 1156--1160","author":"Karatzas Dimosthenis","year":"2015","unstructured":"Dimosthenis Karatzas, Lluis Gomez-Bigorda, Anguelos Nicolaou, Suman Ghosh, Andrew Bagdanov, Masakazu Iwamura, Jiri Matas, Lukas Neumann, Vijay Ramaseshan Chandrasekhar, Shijian Lu, et al. 2015. ICDAR 2015 competition on robust reading. In 2015 13th international conference on document analysis and recognition (ICDAR). IEEE, 1156--1160."},{"key":"e_1_3_2_1_12_1","volume-title":"ICDAR 2013 robust reading competition. In 2013 12th international conference on document analysis and recognition. IEEE, 1484--1493","author":"Karatzas Dimosthenis","year":"2013","unstructured":"Dimosthenis Karatzas, Faisal Shafait, Seiichi Uchida, Masakazu Iwamura, Lluis Gomez i Bigorda, Sergi Robles Mestre, Joan Mas, David Fernandez Mota, Jon Almazan Almazan, and Lluis Pere De Las Heras. 2013. ICDAR 2013 robust reading competition. In 2013 12th international conference on document analysis and recognition. IEEE, 1484--1493."},{"key":"e_1_3_2_1_13_1","volume-title":"Asian Conference on Machine Learning. PMLR, 379--389","author":"Krylov Ilya","year":"2021","unstructured":"Ilya Krylov, Sergei Nosov, and Vladislav Sovrasov. 2021. Open images v5 text annotation and yet another mask text spotter. In Asian Conference on Machine Learning. PMLR, 379--389."},{"key":"e_1_3_2_1_14_1","volume-title":"Albert: A lite bert for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942","author":"Lan Zhenzhong","year":"2019","unstructured":"Zhenzhong Lan. 2019. Albert: A lite bert for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942 (2019)."},{"key":"e_1_3_2_1_15_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov I","year":"2017","unstructured":"I Loshchilov. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_16_1","volume-title":"Shortgpt: Layers in large language models are more redundant than you expect. arXiv preprint arXiv:2403.03853","author":"Men Xin","year":"2024","unstructured":"Xin Men, Mingyu Xu, Qingyu Zhang, Bingning Wang, Hongyu Lin, Yaojie Lu, Xianpei Han, and Weipeng Chen. 2024. Shortgpt: Layers in large language models are more redundant than you expect. arXiv preprint arXiv:2403.03853 (2024)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Anand Mishra Karteek Alahari and CV Jawahar. 2012. Scene text recognition using higher order language priors. In BMVC-British machine vision conference. BMVA.","DOI":"10.5244\/C.26.127"},{"key":"e_1_3_2_1_18_1","volume-title":"2019 International conference on document analysis and recognition (ICDAR). IEEE, 1582--1587","author":"Nayef Nibal","year":"2019","unstructured":"Nibal Nayef, Yash Patel, Michal Busta, Pinaki Nath Chowdhury, Dimosthenis Karatzas, Wafa Khlif, Jiri Matas, Umapada Pal, Jean-Christophe Burie, Cheng-lin Liu, et al. 2019. Icdar2019 robust reading challenge on multi-lingual scene text detection and recognition---rrc-mlt-2019. In 2019 International conference on document analysis and recognition (ICDAR). IEEE, 1582--1587."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.76"},{"key":"e_1_3_2_1_20_1","volume-title":"Large OCR Model: An Empirical Study of Scaling Law for OCR. arXiv preprint arXiv:2401.00028","author":"Rang Miao","year":"2023","unstructured":"Miao Rang, Zhenni Bi, Chuanjian Liu, Yunhe Wang, and Kai Han. 2023. Large OCR Model: An Empirical Study of Scaling Law for OCR. arXiv preprint arXiv:2401.00028 (2023)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2014.07.008"},{"key":"e_1_3_2_1_22_1","volume-title":"Icdar 2017 competition on reading chinese text in the wild (rctw-17). In 2017 14th iapr international conference on document analysis and recognition (ICDAR)","author":"Shi Baoguang","unstructured":"Baoguang Shi, Cong Yao, Minghui Liao, Mingkun Yang, Pei Xu, Linyan Cui, Serge Belongie, Shijian Lu, and Xiang Bai. 2017. Icdar 2017 competition on reading chinese text in the wild (rctw-17). In 2017 14th iapr international conference on document analysis and recognition (ICDAR), Vol. 1. IEEE, 1429--1434."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00869"},{"key":"e_1_3_2_1_24_1","volume-title":"Computer Vision-ACCV 2014: 12th Asian Conference on Computer Vision","author":"Su Bolan","year":"2014","unstructured":"Bolan Su and Shijian Lu. 2015. Accurate scene text recognition based on recurrent neural network. In Computer Vision-ACCV 2014: 12th Asian Conference on Computer Vision, Singapore, Singapore, November 1--5, 2014, Revised Selected Papers, Part I 12. Springer, 35--48."},{"key":"e_1_3_2_1_25_1","volume-title":"ICDAR 2019 competition on large-scale street view text with partial labeling-RRC-LSVT. In 2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 1557--1562","author":"Sun Yipeng","year":"2019","unstructured":"Yipeng Sun, Zihan Ni, Chee-Kheng Chng, Yuliang Liu, Canjie Luo, Chun Chet Ng, Junyu Han, Errui Ding, Jingtuo Liu, Dimosthenis Karatzas, et al. 2019. ICDAR 2019 competition on large-scale street view text with partial labeling-RRC-LSVT. In 2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 1557--1562."},{"key":"e_1_3_2_1_26_1","volume-title":"Coco-text: Dataset and benchmark for text detection and recognition in natural images. arXiv preprint arXiv:1601.07140","author":"Veit Andreas","year":"2016","unstructured":"Andreas Veit, Tomas Matera, Lukas Neumann, Jiri Matas, and Serge Belongie. 2016. Coco-text: Dataset and benchmark for text detection and recognition in natural images. arXiv preprint arXiv:1601.07140 (2016)."},{"key":"e_1_3_2_1_27_1","volume-title":"2011 International conference on computer vision. IEEE, 1457--1464","author":"Wang Kai","year":"2011","unstructured":"Kai Wang, Boris Babenko, and Serge Belongie. 2011. End-to-end scene text recognition. In 2011 International conference on computer vision. IEEE, 1457--1464."},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the 21st international conference on pattern recognition (ICPR2012)","author":"Wang Tao","year":"2012","unstructured":"Tao Wang, David J Wu, Adam Coates, and Andrew Y Ng. 2012. End-to-end text recognition with convolutional neural networks. In Proceedings of the 21st international conference on pattern recognition (ICPR2012). IEEE, 3304--3308."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01393"},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 28327--28336","author":"Xu Jianjun","year":"2024","unstructured":"Jianjun Xu, Yuxin Wang, Hongtao Xie, and Yongdong Zhang. 2024. OTE: Exploring Accurate Scene Text Recognition Using One Token. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 28327--28336."},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings, Part XXIV 16","author":"Zhang Hui","year":"2020","unstructured":"Hui Zhang, Quanming Yao, Mingkun Yang, Yongchao Xu, and Xiang Bai. 2020. AutoSTR: efficient backbone search for scene text recognition. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XXIV 16. Springer, 751--767."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00253"},{"key":"e_1_3_2_1_33_1","volume-title":"SUNw: Scene Understanding Workshop-CVPR","volume":"2017","author":"Zhang Ying","year":"2017","unstructured":"Ying Zhang, Lionel Gueguen, Ilya Zharkov, Peter Zhang, Keith Seifert, and Ben Kadlec. 2017. Uber-text: A large-scale dataset for optical character recognition from street-level imagery. In SUNw: Scene Understanding Workshop-CVPR, Vol. 2017. 5."},{"key":"e_1_3_2_1_34_1","volume-title":"CLIP4STR: A simple baseline for scene text recognition with pre-trained vision-language model. arXiv preprint arXiv:2305.14014","author":"Zhao Shuai","year":"2023","unstructured":"Shuai Zhao, Ruijie Quan, Linchao Zhu, and Yi Yang. 2023. CLIP4STR: A simple baseline for scene text recognition with pre-trained vision-language model. arXiv preprint arXiv:2305.14014 (2023)."}],"event":{"name":"MMSys '25: 16th ACM Multimedia Systems Conference","location":"Stellenbosch South Africa","acronym":"MMSys '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGCOMM ACM Special Interest Group on Data Communication","SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing"]},"container-title":["Proceedings of the 16th ACM Multimedia Systems Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712676.3719267","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:38Z","timestamp":1750295918000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712676.3719267"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,31]]},"references-count":34,"alternative-id":["10.1145\/3712676.3719267","10.1145\/3712676"],"URL":"https:\/\/doi.org\/10.1145\/3712676.3719267","relation":{},"subject":[],"published":{"date-parts":[[2025,3,31]]},"assertion":[{"value":"2025-03-31","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}