{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,8]],"date-time":"2026-03-08T05:57:50Z","timestamp":1772949470585,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Nature Science Foundation of China","award":["62076032, 62225601, U23B2052"],"award-info":[{"award-number":["62076032, 62225601, U23B2052"]}]},{"name":"Beijing Natural Science Foundation Project","award":["Z200002"],"award-info":[{"award-number":["Z200002"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680897","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"4312-4321","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Triple Alignment Strategies for Zero-shot Phrase Grounding under Weak Supervision"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-6193-6983","authenticated-orcid":false,"given":"Pengyue","family":"Lin","sequence":"first","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3543-6272","authenticated-orcid":false,"given":"Ruifan","family":"Li","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3122-2679","authenticated-orcid":false,"given":"Yuzhe","family":"Ji","sequence":"additional","affiliation":[{"name":"Beijing University of 
Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6411-9420","authenticated-orcid":false,"given":"Zhihan","family":"Yu","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4798-4233","authenticated-orcid":false,"given":"Fangxiang","family":"Feng","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2950-2488","authenticated-orcid":false,"given":"Zhanyu","family":"Ma","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0314-8951","authenticated-orcid":false,"given":"Xiaojie","family":"Wang","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01276"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00182"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350599"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00084"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.95"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.95"},{"key":"e_1_3_2_1_7_1","volume-title":"VTQA2023: ACM Multimedia 2023 Visual Text Question Answering Challenge. In Proceedings of the 31st ACM International Conference on Multimedia. 9646--9650","author":"Chen Kang","year":"2023","unstructured":"Kang Chen, Tianli Zhao, and Xiangqian Wu. 2023. VTQA2023: ACM Multimedia 2023 Visual Text Question Answering Challenge. 
In Proceedings of the 31st ACM International Conference on Multimedia. 9646--9650."},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the 29th ACM International Conference on Multimedia. 1966--1975","author":"Chen Nenglun","year":"2021","unstructured":"Nenglun Chen, Xingjia Pan, Runnan Chen, Lei Yang, Zhiwen Lin, Yuqiang Ren, Haolei Yuan, Xiaowei Guo, Feiyue Huang, and Wenping Wang. 2021. Distributed attention for grounded image captioning. In Proceedings of the 29th ACM International Conference on Multimedia. 1966--1975."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00269"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Thomas Eiter Tobias Geibinger Nelson Higuera and Johannes Oetsch. 2023. A Logic-based Approach to Contrastive Explainability for Neurosymbolic Visual Question Answering. In IJCAI. 3668--3676.","DOI":"10.24963\/ijcai.2023\/408"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01470"},{"key":"e_1_3_2_1_13_1","volume-title":"International workshop ontoImage","volume":"2","author":"Grubinger Michael","year":"2006","unstructured":"Michael Grubinger, Paul Clough, Henning M\u00fcller, and Thomas Deselaers. 2006. The iapr tc-12 benchmark: A new evaluation resource for visual information systems. In International workshop ontoImage, Vol. 2. IEEE, Minori, Italy, 13--23."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_44"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00284"},{"key":"e_1_3_2_1_17_1","volume-title":"Learning unsupervised visual grounding through semantic self-supervision. 
CoRR abs\/1803.06506","author":"Javed Syed Ashar","year":"2018","unstructured":"Syed Ashar Javed, Shreyas Saxena, and Vineet Gandhi. 2018. Learning unsupervised visual grounding through semantic self-supervision. CoRR abs\/1803.06506 (2018), http:\/\/arxiv.org\/abs\/1803.06506."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01507"},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific","author":"Kim Huiju","unstructured":"Huiju Kim, Youjin Kang, and SangKeun Lee. 2023. Examining Consistency of Visual Commonsense Reasoning based on Person Grounding. In Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Volume 1: Long Papers). 1026--1039."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Ranjay Krishna Yuke Zhu Oliver Groth Justin Johnson Kenji Hata Joshua Kravitz Stephanie Chen Yannis Kalantidis Li-Jia Li David A Shamma et al. 2017. Visual genome: Connecting language and vision using crowdsourced dense image annotations. International journal of computer vision 123 (2017) 32--73.","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_1_21_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems 34","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. 
Advances in neural information processing systems 34 (2021), 9694--9705."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2204.03647"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25223"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475575"},{"key":"e_1_3_2_1_25_1","volume-title":"Visual Prompt Tuning for Weakly Supervised Phrase Grounding. In ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 7895--7899","author":"Lin Pengyue","year":"2024","unstructured":"Pengyue Lin, Zhihan Yu, Mingcong Lu, Fangxiang Feng, Ruifan Li, and Xiaojie Wang. 2024. Visual Prompt Tuning for Weakly Supervised Phrase Grounding. In ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 7895--7899."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01469"},{"key":"e_1_3_2_1_28_1","first-page":"3003","article-title":"Entity-enhanced adaptive reconstruction network for weakly supervised referring expression grounding","volume":"45","author":"Liu Xuejing","year":"2022","unstructured":"Xuejing Liu, Liang Li, Shuhui Wang, Zheng-Jun Zha, Zechao Li, Qi Tian, and Qingming Huang. 2022. Entity-enhanced adaptive reconstruction network for weakly supervised referring expression grounding. IEEE Transactions on Pattern Analysis and Machine Intelligence 45, 3 (2022), 3003--3018.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00270"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351074"},{"key":"e_1_3_2_1_31_1","volume-title":"Improving Image Paragraph Captioning with Dual Relations. 
In 2022 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1--6.","author":"Liu Yun","year":"2022","unstructured":"Yun Liu, Yihui Shi, Fangxiang Feng, Ruifan Li, Zhanyu Ma, and Xiaojie Wang. 2022. Improving Image Paragraph Captioning with Dual Relations. In 2022 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1--6."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00556"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00265"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01045"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3374786"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_1_37_1","volume-title":"Yu","author":"Qin Libo","year":"2024","unstructured":"Libo Qin, Qiguang Chen, Yuhang Zhou, Zhi Chen, Yinghui Li, Lizi Liao, Min Li, Wanxiang Che, and Philip S. Yu. 2024. Multilingual Large Language Model: A Survey of Resources, Taxonomy and Frontiers. arXiv:2404.04925 [cs.CL] https:\/\/arxiv.org\/abs\/2404.04925"},{"key":"e_1_3_2_1_38_1","volume-title":"International conference on machine learning. PMLR","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, Vienna, Austria, 8748--8763."},{"key":"e_1_3_2_1_39_1","first-page":"12116","article-title":"Do vision transformers see like convolutional neural networks","volume":"34","author":"Raghu Maithra","year":"2021","unstructured":"Maithra Raghu, Thomas Unterthiner, Simon Kornblith, Chiyuan Zhang, and Alexey Dosovitskiy. 2021. Do vision transformers see like convolutional neural networks? 
Advances in Neural Information Processing Systems 34 (2021), 12116--12128.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_40_1","volume-title":"Garnett (Eds.)","volume":"28","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. In Advances in Neural Information Processing Systems, C. Cortes, N. Lawrence, D. Lee, M. Sugiyama, and R. Garnett (Eds.), Vol. 28. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2015\/file\/14bfa6bb14875e45bba028a21ed38046-Paper.pdf"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00479"},{"key":"e_1_3_2_1_42_1","first-page":"28222","article-title":"What is where by looking: Weakly-supervised open-world phrase-grounding without text inputs","volume":"35","author":"Shaharabany Tal","year":"2022","unstructured":"Tal Shaharabany, Yoad Tewel, and Lior Wolf. 2022. What is where by looking: Weakly-supervised open-world phrase-grounding without text inputs. Advances in Neural Information Processing Systems 35 (2022), 28222--28237.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00669"},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","volume":"38","author":"Shen Haozhan","year":"2024","unstructured":"Haozhan Shen, Tiancheng Zhao, Mingwei Zhu, and Jianwei Yin. 2024. Ground-VLP: Harnessing Zero-Shot Visual Grounding from Vision-Language Pre-training and Open-Vocabulary Object Detection. In Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 38. 4766--4775."},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 
15039--15049","author":"Song Yibing","year":"2023","unstructured":"Yibing Song, Ruifei Zhang, Zhihong Chen, Xiang Wan, and Guanbin Li. 2023. Advancing visual grounding with scene knowledge: Benchmark and method. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 15039--15049."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.357"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Satoshi Suzuki et al. 1985. Topological structural analysis of digitized binary images by border following. Computer vision graphics and image processing 30 1 (1985) 32--46.","DOI":"10.1016\/0734-189X(85)90016-7"},{"key":"e_1_3_2_1_48_1","volume-title":"SCLIP: Rethinking Self-Attention for Dense Vision-Language Inference. arXiv:2312.01597 [cs.CV] https:\/\/arxiv.org\/abs\/2312.01597","author":"Wang Feng","year":"2024","unstructured":"Feng Wang, Jieru Mei, and Alan Yuille. 2024. SCLIP: Rethinking Self-Attention for Dense Vision-Language Inference. arXiv:2312.01597 [cs.CV] https:\/\/arxiv.org\/abs\/2312.01597"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_42"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.159"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611737"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548283"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"crossref","unstructured":"Yuechen Wu Zhenhuan Rao Wei Zhang Shijian Lu Weizhi Lu and Zheng-Jun Zha. 2019. Exploring the Task Cooperation in Multi-goal Visual Navigation. In IJCAI. 
609--615.","DOI":"10.24963\/ijcai.2019\/86"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00427"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00238"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01276"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-017-1059-x"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413746"},{"key":"e_1_3_2_1_59_1","first-page":"58508","volume-title":"Levine (Eds.)","volume":"36","author":"Zhang Zicheng","year":"2023","unstructured":"Zicheng Zhang, Bonan Li, Xuecheng Nie, Congying Han, Tiande Guo, and Luoqi Liu. 2023. Towards Consistent Video Editing with Text-to-Image Diffusion Models. In Advances in Neural Information Processing Systems, A. Oh, T. Naumann, A. Globerson, K. Saenko, M. Hardt, and S. Levine (Eds.), Vol. 36. Curran Associates, Inc., 58508--58519. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/b6c05f8254a00709e16fb0fdaae56cd8-Paper-Conference.pdf"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_40"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on 
Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680897","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680897","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:33Z","timestamp":1750295853000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680897"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":60,"alternative-id":["10.1145\/3664647.3680897","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680897","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}