{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T04:54:03Z","timestamp":1781585643766,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 62272227"],"award-info":[{"award-number":["No. 62272227"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"MUR PNRR project FAIR","award":["PE00000013"],"award-info":[{"award-number":["PE00000013"]}]},{"name":"EU Horizon projects ELIAS","award":["No. 101120237"],"award-info":[{"award-number":["No. 101120237"]}]},{"name":"ELLIOT","award":["No. 101214398"],"award-info":[{"award-number":["No. 101214398"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755751","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:55:00Z","timestamp":1761375300000},"page":"5100-5109","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["AlignCAT: Visual-Linguistic Alignment of Category and Attribute for Weakly Supervised Visual Grounding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-9439-5291","authenticated-orcid":false,"given":"Yidan","family":"Wang","sequence":"first","affiliation":[{"name":"Nanjing University of Aeronautics and Astronautics, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0700-5645","authenticated-orcid":false,"given":"Chenyi","family":"Zhuang","sequence":"additional","affiliation":[{"name":"University of Trento, Trento, Italy"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5376-5289","authenticated-orcid":false,"given":"Wutao","family":"Liu","sequence":"additional","affiliation":[{"name":"Nanjing University of Aeronautics and Astronautics, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4492-5430","authenticated-orcid":false,"given":"Pan","family":"Gao","sequence":"additional","affiliation":[{"name":"Nanjing University of Aeronautics and Astronautics, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6597-7248","authenticated-orcid":false,"given":"Nicu","family":"Sebe","sequence":"additional","affiliation":[{"name":"University of Trento, Trento, Italy"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00182"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00425"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681058"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"e_1_3_2_1_5_1","volume-title":"SimVG: A Simple Framework for Visual Grounding with Decoupled Multi-modal Fusion. arXiv preprint arXiv:2409.17531","author":"Dai Ming","year":"2024","unstructured":"Ming Dai, Lingfeng Yang, Yihao Xu, Zhenhua Feng, and Wankou Yang. 2024. SimVG: A Simple Framework for Visual Grounding with Decoupled Multi-modal Fusion. arXiv preprint arXiv:2409.17531 (2024)."},{"key":"e_1_3_2_1_6_1","volume-title":"Adam: A method for stochastic optimization. (No Title)","author":"Diederik P Kingma","year":"2014","unstructured":"P Kingma Diederik. 2014. Adam: A method for stochastic optimization. (No Title) (2014)."},{"key":"e_1_3_2_1_7_1","volume-title":"European Conference on Computer Vision. Springer, 326-342","author":"Eiras Francisco","year":"2024","unstructured":"Francisco Eiras, Kemal Oksuz, Adel Bibi, Philip HS Torr, and Puneet K Dokania. 2024. Segment, select, correct: A framework for weakly-supervised referring segmentation. In European Conference on Computer Vision. Springer, 326-342."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01507"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00263"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01425"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01999"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01999"},{"key":"e_1_3_2_1_13_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, Vol. 34 (2021), 9694-9705."},{"key":"e_1_3_2_1_14_1","first-page":"740","volume-title":"Zurich","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer Vision-ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13. Springer, 740-755."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01469"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02022"},{"key":"e_1_3_2_1_17_1","first-page":"3003","article-title":"Entity-enhanced adaptive reconstruction network for weakly supervised referring expression grounding","volume":"45","author":"Liu Xuejing","year":"2022","unstructured":"Xuejing Liu, Liang Li, Shuhui Wang, Zheng-Jun Zha, Zechao Li, Qi Tian, and Qingming Huang. 2022. Entity-enhanced adaptive reconstruction network for weakly supervised referring expression grounding. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 45, 3 (2022), 3003-3018.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00270"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351074"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351074"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00265"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01005"},{"key":"e_1_3_2_1_23_1","volume-title":"APL: Anchor-Based Prompt Learning for One-Stage Weakly Supervised Referring Expression Comprehension. In European Conference on Computer Vision. Springer, 198-215","author":"Luo Yaxin","year":"2025","unstructured":"Yaxin Luo, Jiayi Ji, Xiaofu Chen, Yuxin Zhang, Tianhe Ren, and Gen Luo. 2025. APL: Anchor-Based Prompt Learning for One-Stage Weakly Supervised Referring Expression Comprehension. In European Conference on Computer Vision. Springer, 198-215."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.9"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00331"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161294"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_48"},{"key":"e_1_3_2_1_28_1","volume-title":"Variational context: Exploiting visual and textual context for grounding referring expressions","author":"Niu Yulei","year":"2019","unstructured":"Yulei Niu, Hanwang Zhang, Zhiwu Lu, and Shih-Fu Chang. 2019. Variational context: Exploiting visual and textual context for grounding referring expressions. IEEE transactions on pattern analysis and machine intelligence, Vol. 43, 1 (2019), 347-359."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20108"},{"key":"e_1_3_2_1_30_1","first-page":"28222","article-title":"What is where by looking: Weakly-supervised open-world phrase-grounding without text inputs","volume":"35","author":"Shaharabany Tal","year":"2022","unstructured":"Tal Shaharabany, Yoad Tewel, and Lior Wolf. 2022. What is where by looking: Weakly-supervised open-world phrase-grounding without text inputs. Advances in Neural Information Processing Systems, Vol. 35 (2022), 28222-28237.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_31_1","volume-title":"From show to tell: A survey on deep learning-based image captioning","author":"Stefanini Matteo","year":"2022","unstructured":"Matteo Stefanini, Marcella Cornia, Lorenzo Baraldi, Silvia Cascianelli, Giuseppe Fiameni, and Rita Cucchiara. 2022. From show to tell: A survey on deep learning-based image captioning. IEEE transactions on pattern analysis and machine intelligence, Vol. 45, 1 (2022), 539-559."},{"key":"e_1_3_2_1_32_1","volume-title":"Weakly-supervised segmentation of referring expressions. arXiv preprint arXiv:2205.04725","author":"Strudel Robin","year":"2022","unstructured":"Robin Strudel, Ivan Laptev, and Cordelia Schmid. 2022. Weakly-supervised segmentation of referring expressions. arXiv preprint arXiv:2205.04725 (2022)."},{"key":"e_1_3_2_1_33_1","volume-title":"Si Liu, and John Y Goulermas.","author":"Sun Mingjie","year":"2021","unstructured":"Mingjie Sun, Jimin Xiao, Eng Gee Lim, Si Liu, and John Y Goulermas. 2021b. Discriminative triad matching and reconstruction for weakly referring expression grounding. IEEE transactions on pattern analysis and machine intelligence, Vol. 43, 11 (2021), 4189-4195."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3139467"},{"key":"e_1_3_2_1_35_1","volume-title":"Context disentangling and prototype inheriting for robust visual grounding","author":"Tang Wei","year":"2023","unstructured":"Wei Tang, Liang Li, Xuejing Liu, Lu Jin, Jinhui Tang, and Zechao Li. 2023. Context disentangling and prototype inheriting for robust visual grounding. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00476"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01837"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00644"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3311917"},{"key":"e_1_3_2_1_41_1","first-page":"18123","article-title":"Counterfactual contrastive learning for weakly-supervised vision-language grounding","volume":"33","author":"Zhang Zhu","year":"2020","unstructured":"Zhu Zhang, Zhou Zhao, Zhijie Lin, Xiuqiang He, et al., 2020. Counterfactual contrastive learning for weakly-supervised vision-language grounding. Advances in Neural Information Processing Systems, Vol. 33 (2020), 18123-18134.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3090426"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_35"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755751","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T03:59:07Z","timestamp":1765339147000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755751"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":43,"alternative-id":["10.1145\/3746027.3755751","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755751","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}