{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T20:07:13Z","timestamp":1765310833646,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","funder":[{"name":"China National Key Research and Development Program","award":["2022YFC3301702, 2022YFC3301703"],"award-info":[{"award-number":["2022YFC3301702, 2022YFC3301703"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["grant no.61771199"],"award-info":[{"award-number":["grant no.61771199"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755101","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:30:51Z","timestamp":1761377451000},"page":"305-314","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["G2LFormer: Global-to-Local Query Enhancement for Robust Table Structure Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-5802-8852","authenticated-orcid":false,"given":"Haosheng","family":"Cai","sequence":"first","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1947-4957","authenticated-orcid":false,"given":"Yang","family":"Xue","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"volume-title":"TableVLM: Multi-modal Pre-training for Table Structure Recognition","author":"Chen Leiyuan","key":"e_1_3_2_1_1_1","unstructured":"Leiyuan Chen, Chengsong Huang, Xiaoqing Zheng, Jinshu Lin, and Xuanjing Huang. 2023. TableVLM: Multi-modal Pre-training for Table Structure Recognition. In ACL. Association for Computational Linguistics, 2437--2449."},{"key":"e_1_3_2_1_2_1","volume-title":"Complicated Table Structure Recognition. CoRR abs\/1908.04729","author":"Chi Zewen","year":"2019","unstructured":"Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xianling Mao. 2019. Complicated Table Structure Recognition. CoRR abs\/1908.04729 (2019)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Yuntian Deng David S. Rosenberg and Gideon Mann. 2019. Challenges in End-to-End Neural Scientific Table Recognition. In ICDAR. 894--901.","DOI":"10.1109\/ICDAR.2019.00148"},{"key":"e_1_3_2_1_4_1","volume-title":"RMT: Retentive Networks Meet Vision Transformers. In CVPR. IEEE, 5641--5651","author":"Fan Qihang","year":"2024","unstructured":"Qihang Fan, Huaibo Huang, Mingrui Chen, Hongmin Liu, and Ran He. 2024. RMT: Retentive Networks Meet Vision Transformers. In CVPR. IEEE, 5641--5651."},{"key":"e_1_3_2_1_5_1","volume-title":"TRUST: An Accurate and End-to-End Table structure Recognizer Using Splitting-based Transformers. CoRR abs\/2208.14687","author":"Guo Zengyuan","year":"2022","unstructured":"Zengyuan Guo, Yuechen Yu, Pengyuan Lv, Chengquan Zhang, Haojie Li, Zhihui Wang, Kun Yao, Jingtuo Liu, and Jingdong Wang. 2022. TRUST: An Accurate and End-to-End Table structure Recognizer Using Splitting-based Transformers. CoRR abs\/2208.14687 (2022)."},{"volume-title":"Deep Residual Learning for Image Recognition","author":"He Kaiming","key":"e_1_3_2_1_6_1","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2016. Deep Residual Learning for Image Recognition. In CVPR. IEEE Computer Society, 770--778."},{"key":"e_1_3_2_1_7_1","volume-title":"PingAn-VCGroup's Solution for ICDAR 2021 Competition on Scientific Table Image Recognition to Latex. CoRR","author":"He Yelin","year":"2021","unstructured":"Yelin He, Xianbiao Qi, Jiaquan Ye, Peng Gao, Yihao Chen, Bingcong Li, Xin Tang, and Rong Xiao. 2021. PingAn-VCGroup's Solution for ICDAR 2021 Competition on Scientific Table Image Recognition to Latex. CoRR (2021)."},{"volume-title":"Improving Table Structure Recognition with Visual-Alignment Sequential Coordinate Modeling","author":"Huang Yongshuai","key":"e_1_3_2_1_8_1","unstructured":"Yongshuai Huang, Ning Lu, Dapeng Chen, Yibo Li, Zecheng Xie, Shenggao Zhu, Liangcai Gao, and Wei Peng. 2023. Improving Table Structure Recognition with Visual-Alignment Sequential Coordinate Modeling. In CVPR. IEEE, 11134--11143."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Yupan Huang Tengchao Lv Lei Cui Yutong Lu and Furu Wei. [n.d.]. LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In MM. ACM 4083--4091.","DOI":"10.1145\/3503161.3548112"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-70533-5_23"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-70533-5_23"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Alex Kendall Yarin Gal and Roberto Cipolla. 2018. Multi-Task Learning Using Uncertainty to Weigh Losses for Scene Geometry and Semantics. In CVPR. 7482--7491.","DOI":"10.1109\/CVPR.2018.00781"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_29"},{"key":"e_1_3_2_1_14_1","volume-title":"BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension. In ACL. 7871--7880.","author":"Lewis Mike","year":"2020","unstructured":"Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Veselin Stoyanov, and Luke Zettlemoyer. 2020. BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension. In ACL. 7871--7880."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3335410"},{"key":"e_1_3_2_1_16_1","unstructured":"Minghao Li Lei Cui Shaohan Huang FuruWei Ming Zhou and Zhoujun Li. 2020. TableBank: Table Benchmark for Image-based Table Detection and Recognition. In LREC. 1918--1925."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Tsung-Yi Lin Priya Goyal Ross B. Girshick Kaiming He and Piotr Doll\u00e1r. 2017. Focal Loss for Dense Object Detection. In ICCV. 2999--3007.","DOI":"10.1109\/ICCV.2017.324"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Weihong Lin Zheng Sun Chixiang Ma Mingze Li Jiawei Wang Lei Sun and Qiang Huo. 2022. TSRFormer: Table Structure Recognition with Transformers. In ACM MM. 6473--6482.","DOI":"10.1145\/3503161.3548038"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Hao Liu Xin Li Bing Liu Deqiang Jiang Yinsong Liu and Bo Ren. 2022. Neural Collaborative Graph Machines for Table Structure Recognition. In CVPR. 4523--4532.","DOI":"10.1109\/CVPR52688.2022.00449"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Hao Liu Xin Li Bing Liu Deqiang Jiang Yinsong Liu Bo Ren and Rongrong Ji. 2021. Show Read and Reason: Table Structure Recognition with Flexible Context Aggregator. In MM. ACM 1084--1092.","DOI":"10.1145\/3474085.3481534"},{"key":"e_1_3_2_1_21_1","unstructured":"Liyuan Liu Haoming Jiang Pengcheng He Weizhu Chen Xiaodong Liu Jianfeng Gao and Jiawei Han. 2020. On the Variance of the Adaptive Learning Rate and Beyond. In ICLR."},{"key":"e_1_3_2_1_22_1","unstructured":"Shilong Liu Feng Li Hao Zhang Xiao Yang Xianbiao Qi Hang Su Jun Zhu and Lei Zhang. 2022. DAB-DETR: Dynamic Anchor Boxes are Better Queries for DETR. In ICLR."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Rujiao Long Wen Wang Nan Xue Feiyu Gao Zhibo Yang Yongpan Wang and Gui-Song Xia. 2021. Parsing Table Structures in the Wild. In ICCV. 924--932.","DOI":"10.1109\/ICCV48922.2021.00098"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-41679-8_2"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Pengyuan Lyu Weihong Ma Hongyi Wang Yuechen Yu Chengquan Zhang Kun Yao Yang Xue and Jingdong Wang. 2023. GridFormer: Towards Accurate Table Structure Recognition via Grid Prediction. In MM. ACM 7747--7757.","DOI":"10.1145\/3581783.3611961"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.109006"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Minesh Mathew Dimosthenis Karatzas and C. V. Jawahar. 2021. DocVQA: A Dataset for VQA on Document Images. In WACV. IEEE 2199--2208.","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"e_1_3_2_1_28_1","volume-title":"Martins","author":"Mihaylova Tsvetomila","year":"2019","unstructured":"Tsvetomila Mihaylova and Andr\u00e9 F. T. Martins. 2019. Scheduled Sampling for Transformers. In ACL. 351--356."},{"key":"e_1_3_2_1_29_1","volume-title":"Staar","author":"Nassar Ahmed S.","year":"2022","unstructured":"Ahmed S. Nassar, Nikolaos Livathinos, Maksym Lysak, and Peter W. J. Staar. 2022. TableFormer: Table Structure Understanding with Transformers. In CVPR. 4604--4613."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Shubham Singh Paliwal Vishwanath D Rohit Rahul Monika Sharma and Lovekesh Vig. 2019. TableNet: Deep Learning Model for End-to-end Table Detection and Tabular Data Extraction from Scanned Document Images. In ICDAR. 128--133.","DOI":"10.1109\/ICDAR.2019.00029"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-86549-8_7"},{"key":"e_1_3_2_1_32_1","unstructured":"Chunxia Qin Zhenrong Zhang Pengfei Hu Chenyu Liu Jiefeng Ma and Jun Du. [n. d.]. SEMv3: A Fast and Robust Approach to Table Separation Line Detection. In IJCAI. 1191--1199."},{"key":"e_1_3_2_1_33_1","first-page":"70","article-title":"Table Structure Recognition Using Top-Down and Bottom-Up Cues","volume":"12373","author":"Raja Sachin","year":"2020","unstructured":"Sachin Raja, Ajoy Mondal, and C. V. Jawahar. 2020. Table Structure Recognition Using Top-Down and Bottom-Up Cues. In ECCV, Vol. 12373. 70--86.","journal-title":"ECCV"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Hamid Rezatofighi Nathan Tsoi JunYoung Gwak Amir Sadeghian Ian D. Reid and Silvio Savarese. 2019. Generalized Intersection Over Union: A Metric and a Loss for Bounding Box Regression. In CVPR. 658--666.","DOI":"10.1109\/CVPR.2019.00075"},{"key":"e_1_3_2_1_35_1","volume-title":"Syed Tahseen Raza Rizvi","author":"Siddiqui Shoaib Ahmed","year":"2019","unstructured":"Shoaib Ahmed Siddiqui, Imran Ali Fateh, Syed Tahseen Raza Rizvi, Andreas Dengel, and Sheraz Ahmed. 2019. DeepTabStR: Deep Learning based Table Structure Recognition. In ICDAR. 1403--1409."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Brandon Smock Rohith Pesala and Robin Abraham. 2022. PubTables-1M: Towards comprehensive table extraction from unstructured documents. In CVPR. 4624--4632.","DOI":"10.1109\/CVPR52688.2022.00459"},{"key":"e_1_3_2_1_37_1","unstructured":"TAL Contributors. 2021. TAL_OCR_TABLE: A Scene Table Structure Recognition Benchmark. https:\/\/ai.100tal.com\/dataset."},{"key":"e_1_3_2_1_38_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention is All you Need. In Neural Information Processing Systems. 5998--6008."},{"key":"e_1_3_2_1_39_1","volume-title":"OMNIPARSER: A Unified Framework for Text Spotting, Key Information Extraction and Table Recognition","author":"Wan Jianqiang","year":"2024","unstructured":"Jianqiang Wan, Sibo Song, Wenwen Yu, Yuliang Liu, Wenqing Cheng, Fei Huang, Xiang Bai, Cong Yao, and Zhibo Yang. 2024. OMNIPARSER: A Unified Framework for Text Spotting, Key Information Extraction and Table Recognition. In CVPR. IEEE, 15641--15653."},{"key":"e_1_3_2_1_40_1","volume-title":"LORE: Logical Location Regression Network for Table Structure Recognition. In AAAI. 2992--3000.","author":"Xing Hangdi","year":"2023","unstructured":"Hangdi Xing, Feiyu Gao, Rujiao Long, Jiajun Bu, Qi Zheng, Liangcheng Li, Cong Yao, and Zhi Yu. 2023. LORE: Logical Location Regression Network for Table Structure Recognition. In AAAI. 2992--3000."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Wenyuan Xue Qingyong Li and Dacheng Tao. 2019. ReS2TIM: Reconstruct Syntactic Structures from Table Images. In ICDAR. 749--755.","DOI":"10.1109\/ICDAR.2019.00125"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41597-023-01985-8"},{"key":"e_1_3_2_1_43_1","unstructured":"Jiaquan Ye Xianbiao Qi Yelin He Yihao Chen Dengyi Gu Peng Gao and Rong Xiao. 2021. PingAn-VCGroup's Solution for ICDAR 2021 Competition on Scientific Literature Parsing Task B: Table Recognition to HTML. CoRR abs\/2105.01848."},{"key":"e_1_3_2_1_44_1","volume-title":"Gradient Centralization: A New Optimization Technique for Deep Neural Networks. In ECCV","author":"Yong Hongwei","year":"2020","unstructured":"Hongwei Yong, Jianqiang Huang, Xiansheng Hua, and Lei Zhang. 2020. Gradient Centralization: A New Optimization Technique for Deep Neural Networks. In ECCV, Vol. 12346. Springer, 635--652."},{"key":"e_1_3_2_1_45_1","unstructured":"Yuechen Yu Yulin Li Chengquan Zhang Xiaoqiang Zhang Zengyuan Guo Xiameng Qin Kun Yao Junyu Han Errui Ding and Jingdong Wang. 2023. StrucTexTv2: Masked Visual-Textual Prediction for Document Image Pre-training. In ICLR."},{"key":"e_1_3_2_1_46_1","volume-title":"Hinton","author":"Zhang Michael R.","year":"2019","unstructured":"Michael R. Zhang, James Lucas, Jimmy Ba, and Geoffrey E. Hinton. 2019. Lookahead Optimizer: k steps forward, 1 step back. In Neural Information Processing Systems. 9593--9604."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2024.110279"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Zhenrong Zhang Shuhang Liu Pengfei Hu Jiefeng Ma Jun Du Jianshu Zhang and Yu Hu. 2024. UniTabNet: Bridging Vision and Language Models for Enhanced Table Structure Recognition. In EMNLP. 6131--6143.","DOI":"10.18653\/v1\/2024.findings-emnlp.355"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.108565"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Xinyi Zheng Douglas Burdick Lucian Popa Xu Zhong and Nancy Xin RuWang. 2021. Global Table Extractor (GTE): A Framework for Joint Table Identification and Cell Structure Recognition Using Visual Context. In WACV. 697--706.","DOI":"10.1109\/WACV48630.2021.00074"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_34"},{"key":"e_1_3_2_1_52_1","unstructured":"Xizhou Zhu Weijie Su Lewei Lu Bin Li Xiaogang Wang and Jifeng Dai. 2021. Deformable DETR: Deformable Transformers for End-to-End Object Detection. In ICLR."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755101","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T20:04:27Z","timestamp":1765310667000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755101"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":52,"alternative-id":["10.1145\/3746027.3755101","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755101","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}