{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:51:05Z","timestamp":1765309865178,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":67,"publisher":"ACM","funder":[{"name":"National Key R&D Program of China","award":["2022YFB3303102"],"award-info":[{"award-number":["2022YFB3303102"]}]},{"name":"Robotics Institute of Zhejiang University","award":["Grant K11808 and K11811"],"award-info":[{"award-number":["Grant K11808 and K11811"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755247","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:38Z","timestamp":1761377198000},"page":"1549-1558","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Art4Math: Handwritten Mathematical Expression Recognition via Multimodal Sketch Grounding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-2853-8351","authenticated-orcid":false,"given":"Yang","family":"Zhou","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3106-021X","authenticated-orcid":false,"given":"Jin","family":"Wang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4659-1300","authenticated-orcid":false,"given":"Yuxiao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1256-153X","authenticated-orcid":false,"given":"Kaixiang","family":"Huang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2762-9912","authenticated-orcid":false,"given":"Guodong","family":"Lu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0572-6612","authenticated-orcid":false,"given":"Jingru","family":"Yang","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3802-4644","authenticated-orcid":false,"given":"Shengfeng","family":"He","sequence":"additional","affiliation":[{"name":"Singapore Management University, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Beit: Bert pre-training of image transformers. arXiv preprint arXiv:2106.08254","author":"Bao Hangbo","year":"2021","unstructured":"Hangbo Bao, Li Dong, Songhao Piao, and Furu Wei. 2021. Beit: Bert pre-training of image transformers. arXiv preprint arXiv:2106.08254 (2021)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00562"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i1.19885"},{"key":"e_1_3_2_1_4_1","volume-title":"International conference on machine learning. PMLR, 1691-1703","author":"Chen Mark","year":"2020","unstructured":"Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, and Ilya Sutskever. 2020b. Generative pretraining from pixels. In International conference on machine learning. PMLR, 1691-1703."},{"key":"e_1_3_2_1_5_1","volume-title":"International conference on machine learning. PmLR, 1597-1607","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020a. A simple framework for contrastive learning of visual representations. In International conference on machine learning. PmLR, 1597-1607."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"e_1_3_2_1_7_1","volume-title":"International Conference on Machine Learning. PMLR, 980-989","author":"Deng Yuntian","year":"2017","unstructured":"Yuntian Deng, Anssi Kanervisto, Jeffrey Ling, and Alexander M Rush. 2017. Image-to-markup generation with coarse-to-fine attention. In International Conference on Machine Learning. PMLR, 980-989."},{"key":"e_1_3_2_1_8_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_9_1","first-page":"4171","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171-4186."},{"key":"e_1_3_2_1_10_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_11_1","volume-title":"How do humans sketch objects? ACM Transactions on graphics (TOG)","author":"Eitz Mathias","year":"2012","unstructured":"Mathias Eitz, James Hays, and Marc Alexa. 2012. How do humans sketch objects? ACM Transactions on graphics (TOG), Vol. 31, 4 (2012), 1-10."},{"key":"e_1_3_2_1_12_1","volume-title":"SATD: syntax-aware handwritten mathematical expression recognition based on tree-structured transformer decoder. The Visual Computer","author":"Fu Pengbin","year":"2024","unstructured":"Pengbin Fu, Ganyun Xiao, and Huirong Yang. 2024. SATD: syntax-aware handwritten mathematical expression recognition based on tree-structured transformer decoder. The Visual Computer (2024), 1-18."},{"key":"e_1_3_2_1_13_1","volume-title":"Mathwriting: A dataset for handwritten mathematical expression recognition","author":"Gervais Philippe","year":"2024","unstructured":"Philippe Gervais, Asya Fadeeva, and Andrii Maksai. 2024. Mathwriting: A dataset for handwritten mathematical expression recognition, 2024. URL https:\/\/arxiv.org\/abs\/2404.10690 (2024)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cosrev.2022.100515"},{"key":"e_1_3_2_1_15_1","volume-title":"Zhaohan Guo, Mohammad Gheshlaghi Azar, et al.","author":"Grill Jean-Bastien","year":"2020","unstructured":"Jean-Bastien Grill, Florian Strub, Florent Altch\u00e9, Corentin Tallec, Pierre Richemond, Elena Buchatskaya, Carl Doersch, Bernardo Avila Pires, Zhaohan Guo, Mohammad Gheshlaghi Azar, et al., 2020. Bootstrap your own latent-a new approach to self-supervised learning. Advances in neural information processing systems, Vol. 33 (2020), 21271-21284."},{"key":"e_1_3_2_1_16_1","volume-title":"European Conference on Computer Vision. Springer, 130-147","author":"Guan Tongkun","year":"2024","unstructured":"Tongkun Guan, Chengyu Lin, Wei Shen, and Xiaokang Yang. 2024. PosFormer: recognizing complex handwritten mathematical expression with position forest transformer. In European Conference on Computer Vision. Springer, 130-147."},{"key":"e_1_3_2_1_17_1","volume-title":"A neural representation of sketch drawings. arXiv preprint arXiv:1704.03477","author":"Ha David","year":"2017","unstructured":"David Ha and Douglas Eck. 2017. A neural representation of sketch drawings. arXiv preprint arXiv:1704.03477 (2017)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"e_1_3_2_1_22_1","volume-title":"Pixel-bert: Aligning image pixels with text by deep multi-modal transformers. arXiv preprint arXiv:2004.00849","author":"Huang Zhicheng","year":"2020","unstructured":"Zhicheng Huang, Zhaoyang Zeng, Bei Liu, Dongmei Fu, and Jianlong Fu. 2020. Pixel-bert: Aligning image pixels with text by deep multi-modal transformers. arXiv preprint arXiv:2004.00849 (2020)."},{"key":"e_1_3_2_1_23_1","volume-title":"International conference on machine learning. PMLR, 5583-5594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: Vision-and-language transformer without convolution or region supervision. In International conference on machine learning. PMLR, 5583-5594."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00688"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00291"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_12"},{"key":"e_1_3_2_1_27_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, Vol. 34 (2021), 9694-9705."},{"key":"e_1_3_2_1_28_1","volume-title":"Sketch-R2CNN: an RNN-rasterization-CNN architecture for vector sketch recognition","author":"Li Lei","year":"2020","unstructured":"Lei Li, Changqing Zou, Youyi Zheng, Qingkun Su, Hongbo Fu, and Chiew-Lan Tai. 2020. Sketch-R2CNN: an RNN-rasterization-CNN architecture for vector sketch recognition. IEEE transactions on visualization and computer graphics, Vol. 27, 9 (2020), 3745-3754."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3260648"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02236"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00679"},{"key":"e_1_3_2_1_32_1","volume-title":"NAMER: Non-Autoregressive Modeling for Handwritten Mathematical Expression Recognition. In European Conference on Computer Vision. Springer, 273-291","author":"Liu Chenyu","year":"2024","unstructured":"Chenyu Liu, Jia Pan, Jinshui Hu, Baocai Yin, Bing Yin, Mingjun Chen, Cong Liu, Jun Du, and Qingfeng Liu. 2024. NAMER: Non-Autoregressive Modeling for Handwritten Mathematical Expression Recognition. In European Conference on Computer Vision. Springer, 273-291."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00376"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-41676-7_9"},{"key":"e_1_3_2_1_35_1","volume-title":"ICFHR 2014 competition on recognition of on-line handwritten mathematical expressions (CROHME 2014). In 2014 14th International Conference on Frontiers in Handwriting Recognition. IEEE, 791-796","author":"Mouchere Harold","year":"2014","unstructured":"Harold Mouchere, Christian Viard-Gaudin, Richard Zanibbi, and Utpal Garain. 2014. ICFHR 2014 competition on recognition of on-line handwritten mathematical expressions (CROHME 2014). In 2014 14th International Conference on Frontiers in Handwriting Recognition. IEEE, 791-796."},{"key":"e_1_3_2_1_36_1","volume-title":"ICFHR2016 CROHME: Competition on recognition of online handwritten mathematical expressions. In 2016 15th International Conference on Frontiers in Handwriting Recognition (ICFHR). IEEE, 607-612","author":"Mouch\u00e8re Harold","year":"2016","unstructured":"Harold Mouch\u00e8re, Christian Viard-Gaudin, Richard Zanibbi, and Utpal Garain. 2016. ICFHR2016 CROHME: Competition on recognition of online handwritten mathematical expressions. In 2016 15th International Conference on Frontiers in Handwriting Recognition (ICFHR). IEEE, 607-612."},{"key":"e_1_3_2_1_37_1","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Th\u00e9o Moutakanni Huy Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel Haziza Francisco Massa Alaaeldin El-Nouby et al. 2023. Dinov2: Learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02234"},{"key":"e_1_3_2_1_39_1","volume-title":"Sketchdreamer: Interactive text-augmented creative sketch ideation. arXiv preprint arXiv:2308.14191","author":"Qu Zhiyu","year":"2023","unstructured":"Zhiyu Qu, Tao Xiang, and Yi-Zhe Song. 2023b. Sketchdreamer: Interactive text-augmented creative sketch ideation. arXiv preprint arXiv:2308.14191 (2023)."},{"key":"e_1_3_2_1_40_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_41_1","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans Ilya Sutskever et al. 2018. Improving language understanding by generative pre-training. (2018)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01416"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00271"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00731"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20136"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICFHR2020.2020.00042"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01481"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00191"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20172"},{"key":"e_1_3_2_1_50_1","first-page":"18","volume-title":"ECML PKDD 2018, Dublin, Ireland, September 10-14, 2018, Proceedings, Part I 18","author":"Wu Jin-Wen","year":"2019","unstructured":"Jin-Wen Wu, Fei Yin, Yan-Ming Zhang, Xu-Yao Zhang, and Cheng-Lin Liu. 2019. Image-to-markup generation via paired adversarial learning. In Machine learning and knowledge discovery in databases: European conference, ECML PKDD 2018, Dublin, Ireland, September 10-14, 2018, Proceedings, Part I 18. Springer, 18-34."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01291-5"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00943"},{"key":"e_1_3_2_1_53_1","volume-title":"attend and tell: Neural image caption generation with visual attention. arXiv preprint arXiv:1502.03044","author":"Show Kelvin Xu.","year":"2015","unstructured":"Kelvin Xu. 2015. Show, attend and tell: Neural image caption generation with visual attention. arXiv preprint arXiv:1502.03044 (2015)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00844"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00994"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02001-1"},{"key":"e_1_3_2_1_57_1","volume-title":"Sketch-a-net: A deep neural network that beats humans. International journal of computer vision","author":"Yu Qian","year":"2017","unstructured":"Qian Yu, Yongxin Yang, Feng Liu, Yi-Zhe Song, Tao Xiang, and Timothy M Hospedales. 2017. Sketch-a-net: A deep neural network that beats humans. International journal of computer vision, Vol. 122 (2017), 411-425."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00451"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2018.8546031"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2844689"},{"key":"e_1_3_2_1_61_1","volume-title":"International Conference on Machine Learning. PMLR, 11076-11085","author":"Zhang Jianshu","year":"2020","unstructured":"Jianshu Zhang, Jun Du, Yongxin Yang, Yi-Zhe Song, Si Wei, and Lirong Dai. 2020. A tree-structured decoder for image-to-markup generation. In International Conference on Machine Learning. PMLR, 11076-11085."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2017.06.017"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10097228"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_23"},{"key":"e_1_3_2_1_65_1","first-page":"570","volume-title":"Lausanne","author":"Zhao Wenqi","year":"2021","unstructured":"Wenqi Zhao, Liangcai Gao, Zuoyu Yan, Shuai Peng, Lin Du, and Ziyin Zhang. 2021. Handwritten mathematical expression recognition with bidirectionally trained transformer. In Document analysis and recognition-ICDAR 2021: 16th international conference, Lausanne, Switzerland, September 5-10, 2021, proceedings, part II 16. Springer, 570-584."},{"key":"e_1_3_2_1_66_1","volume-title":"Harmonizing visual text comprehension and generation. arXiv preprint arXiv:2407.16364","author":"Zhao Zhen","year":"2024","unstructured":"Zhen Zhao, Jingqun Tang, Binghong Wu, Chunhui Lin, Shu Wei, Hao Liu, Xin Tan, Zhizhong Zhang, Can Huang, and Yuan Xie. 2024. Harmonizing visual text comprehension and generation. arXiv preprint arXiv:2407.16364 (2024)."},{"key":"e_1_3_2_1_67_1","volume-title":"TAMER: Tree-Aware Transformer for Handwritten Mathematical Expression Recognition. arXiv preprint arXiv:2408.08578","author":"Zhu Jianhua","year":"2024","unstructured":"Jianhua Zhu, Wenqi Zhao, Yu Li, Xingjian Hu, and Liangcai Gao. 2024. TAMER: Tree-Aware Transformer for Handwritten Mathematical Expression Recognition. arXiv preprint arXiv:2408.08578 (2024)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755247","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:46:57Z","timestamp":1765309617000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755247"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":67,"alternative-id":["10.1145\/3746027.3755247","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755247","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}