{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,2]],"date-time":"2025-11-02T19:09:16Z","timestamp":1762110556363,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2021YFB1714300"],"award-info":[{"award-number":["2021YFB1714300"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Beihang World TOP University Cooperation Program"},{"name":"Beijing Natural Science Foundation","award":["L231011"],"award-info":[{"award-number":["L231011"]}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100018537","name":"National Science and Technology Major Project","doi-asserted-by":"publisher","award":["2022ZD0115502"],"award-info":[{"award-number":["2022ZD0115502"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100018537","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62122010, U23B2010, 62132001"],"award-info":[{"award-number":["62122010, U23B2010, 62132001"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Zhejiang Provincial Natural Science Foundation of China","award":["LDT23F02022F02"],"award-info":[{"award-number":["LDT23F02022F02"]}]},{"name":"Meituan"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3686836","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"9485-9494","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Dynamic Prompting of Frozen Text-to-Image Diffusion Models for Panoptic Narrative Grounding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-9261-4012","authenticated-orcid":false,"given":"Hongyu","family":"Li","sequence":"first","affiliation":[{"name":"School of Artificial Intelligence, Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1172-1554","authenticated-orcid":false,"given":"Tianrui","family":"Hui","sequence":"additional","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9863-5754","authenticated-orcid":false,"given":"Zihan","family":"Ding","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3516-0111","authenticated-orcid":false,"given":"Jing","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Software, Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8371-0590","authenticated-orcid":false,"given":"Bin","family":"Ma","sequence":"additional","affiliation":[{"name":"Meituan, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7471-8344","authenticated-orcid":false,"given":"Xiaoming","family":"Wei","sequence":"additional","affiliation":[{"name":"Meituan, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1107-3873","authenticated-orcid":false,"given":"Jizhong","family":"Han","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9180-2935","authenticated-orcid":false,"given":"Si","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Beihang University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"e_1_3_2_1_2_1","volume-title":"Advances in Neural Information Processing Systems","volume":"34","author":"Cheng Bowen","year":"2021","unstructured":"Bowen Cheng, Alex Schwing, and Alexander Kirillov. 2021. Per-pixel classification is not all you need for semantic segmentation. Advances in Neural Information Processing Systems, Vol. 34 (2021)."},{"key":"e_1_3_2_1_3_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01601"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548086"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00491"},{"key":"e_1_3_2_1_7_1","volume-title":"Encoder Fusion Network with Co-Attention Embedding for Referring Image Segmentation. In 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 15501--15510","author":"Feng Guang","year":"2021","unstructured":"Guang Feng, Zhiwei Hu, Lihe Zhang, and Huchuan Lu. 2021. Encoder Fusion Network with Co-Attention Embedding for Referring Image Segmentation. In 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 15501--15510."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00140"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3286760"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_7"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01050"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/99"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00417"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3235720"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58607-2_4"},{"key":"e_1_3_2_1_17_1","volume-title":"Diffusion models for zero-shot open-vocabulary segmentation. arXiv preprint arXiv:2306.09316","author":"Karazija Laurynas","year":"2023","unstructured":"Laurynas Karazija, Iro Laina, Andrea Vedaldi, and Christian Rupprecht. 2023. Diffusion models for zero-shot open-vocabulary segmentation. arXiv preprint arXiv:2306.09316 (2023)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01761"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_21_1","volume-title":"Context Does Matter: End-to-end Panoptic Narrative Grounding with Deformable Attention Refined Matching Network. In 2023 IEEE International Conference on Data Mining (ICDM). IEEE, 1163--1168","author":"Lin Yiming","year":"2023","unstructured":"Yiming Lin, Xiao-Bo Jin, Qiufeng Wang, and Kaizhu Huang. 2023. Context Does Matter: End-to-end Panoptic Narrative Grounding with Deformable Attention Refined Matching Network. In 2023 IEEE International Conference on Data Mining (ICDM). IEEE, 1163--1168."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01789"},{"key":"e_1_3_2_1_23_1","first-page":"4761","article-title":"Cross-modal progressive comprehension for referring segmentation","volume":"44","author":"Liu Si","year":"2021","unstructured":"Si Liu, Tianrui Hui, Shaofei Huang, Yunchao Wei, Bo Li, and Guanbin Li. 2021. Cross-modal progressive comprehension for referring segmentation. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 44, 9 (2021), 4761--4775.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_24_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"e_1_3_2_1_26_1","volume-title":"Multi-Task Collaborative Network for Joint Referring Expression Comprehension and Segmentation. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 10031--10040","author":"Luo Gen","year":"2020","unstructured":"Gen Luo, Yiyi Zhou, Xiaoshuai Sun, Liujuan Cao, Chenglin Wu, Cheng Deng, and Rongrong Ji. 2020. Multi-Task Collaborative Network for Joint Referring Expression Comprehension and Segmentation. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 10031--10040."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_38"},{"key":"e_1_3_2_1_28_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_29_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog Vol. 1 8 (2019) 9."},{"key":"e_1_3_2_1_30_1","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research, Vol. 21, 140 (2020), 1--67.","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_32_1","volume-title":"U-net: Convolutional networks for biomedical image segmentation. In Medical Image Computing and Computer-Assisted Intervention--MICCAI 2015: 18th International Conference","author":"Ronneberger Olaf","year":"2015","unstructured":"Olaf Ronneberger, Philipp Fischer, and Thomas Brox. 2015. U-net: Convolutional networks for biomedical image segmentation. In Medical Image Computing and Computer-Assisted Intervention--MICCAI 2015: 18th International Conference, Munich, Germany, October 5--9, 2015, Proceedings, Part III 18. Springer, 234--241."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-67558-9_28"},{"key":"e_1_3_2_1_34_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_35_1","volume-title":"NICE: Improving Panoptic Narrative Detection and Segmentation with Cascading Collaborative Learning. arXiv preprint arXiv:2310.10975","author":"Wang Haowei","year":"2023","unstructured":"Haowei Wang, Jiayi Ji, Tianyu Guo, Yilong Yang, Yiyi Zhou, Xiaoshuai Sun, and Rongrong Ji. 2023. NICE: Improving Panoptic Narrative Detection and Segmentation with Cascading Collaborative Learning. arXiv preprint arXiv:2310.10975 (2023)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25350"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01721-6"},{"key":"e_1_3_2_1_38_1","volume-title":"Diffusion model is secretly a training-free open vocabulary semantic segmenter. arXiv preprint arXiv:2309.02773","author":"Wang Jinglong","year":"2023","unstructured":"Jinglong Wang, Xiawei Li, Jing Zhang, Qingyuan Xu, Qin Zhou, Qian Yu, Lu Sheng, and Dong Xu. 2023. Diffusion model is secretly a training-free open vocabulary semantic segmenter. arXiv preprint arXiv:2309.02773 (2023)."},{"key":"e_1_3_2_1_39_1","volume-title":"BarLeRIa: An Efficient Tuning Framework for Referring Image Segmentation. In The Twelfth International Conference on Learning Representations.","author":"Wang Yaoming","year":"2024","unstructured":"Yaoming Wang, Jin Li, Xiaopeng Zhang, Bowen Shi, Chenglin Li, Wenrui Dai, Hongkai Xiong, and Qi Tian. 2024. BarLeRIa: An Efficient Tuning Framework for Referring Image Segmentation. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01139"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00117"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00289"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01605"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01762"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25428"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413846"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00527"},{"key":"e_1_3_2_1_48_1","volume-title":"Deformable detr: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159","author":"Zhu Xizhou","year":"2020","unstructured":"Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, and Jifeng Dai. 2020. Deformable detr: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159 (2020)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3686836","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3686836","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:28Z","timestamp":1750295848000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3686836"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":48,"alternative-id":["10.1145\/3664647.3686836","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3686836","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}