{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,17]],"date-time":"2026-03-17T00:00:09Z","timestamp":1773705609759,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U21B2043,62206279"],"award-info":[{"award-number":["U21B2043,62206279"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100006374","name":"National Science and Technology Major Project","doi-asserted-by":"publisher","award":["2022ZD0118801"],"award-info":[{"award-number":["2022ZD0118801"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658095","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"451-459","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Calibration &amp; Reconstruction: Deeply Integrated Language for Referring Image Segmentation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-7620-4214","authenticated-orcid":false,"given":"Yichen","family":"Yan","sequence":"first","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences &amp; School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5396-6253","authenticated-orcid":false,"given":"Xingjian","family":"He","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3539-8085","authenticated-orcid":false,"given":"Sihan","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0903-9131","authenticated-orcid":false,"given":"Jing","family":"Liu","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences &amp; School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Jamie Ryan Kiros, and Geoffrey E Hinton","author":"Ba Jimmy Lei","year":"2016","unstructured":"Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. 2016. Layer normalization. arXiv preprint arXiv:1607.06450."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_3_1","unstructured":"Shizhe Chen Pierre-Louis Guhur Cordelia Schmid and Ivan Laptev. 2021. History aware multimodal transformer for vision-and-language navigation. Advances in neural information processing systems 34 5834--5847."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Zesen Cheng Kehan Li Peng Jin Xiangyang Ji Li Yuan Chang Liu and Jie Chen. 2023. Parallel vertex diffusion for unified visual grounding. arXiv preprint arXiv:2303.07216.","DOI":"10.1609\/aaai.v38i2.27896"},{"key":"e_1_3_2_1_5_1","unstructured":"Jacob Devlin Ming-Wei Chang Kenton Lee and Kristina Toutanova. 2018. Bert: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805."},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision, 16321-- 16330","author":"Ding Henghui","year":"2021","unstructured":"Henghui Ding, Chang Liu, Suchen Wang, and Xudong Jiang. 2021. Visionlanguage transformer and query generation for referring segmentation. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, 16321-- 16330."},{"key":"e_1_3_2_1_7_1","article-title":"Vlt: visionlanguage transformer and query generation for referring segmentation","author":"Ding Henghui","year":"2022","unstructured":"Henghui Ding, Chang Liu, Suchen Wang, and Xudong Jiang. 2022. Vlt: visionlanguage transformer and query generation for referring segmentation. IEEE Transactions on Pattern Analysis and Machine Intelligence.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence."},{"key":"e_1_3_2_1_8_1","unstructured":"Alexey Dosovitskiy et al. 2020. An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01525"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_7"},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 4424--4433","author":"Hu Zhiwei","year":"2020","unstructured":"Zhiwei Hu, Guang Feng, Jiayu Sun, Lihe Zhang, and Huchuan Lu. 2020. Bidirectional relationship inferring network for referring image segmentation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 4424--4433."},{"key":"e_1_3_2_1_12_1","unstructured":"Zhicheng Huang Zhaoyang Zeng Bei Liu Dongmei Fu and Jianlong Fu. 2020. Pixel-bert: aligning image pixels with text by deep multi-modal transformers. arXiv preprint arXiv:2004.00849."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00973"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01761"},{"key":"e_1_3_2_1_15_1","volume-title":"International Conference on Machine Learning. PMLR, 5583--5594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: vision-and-language transformer without convolution or region supervision. In International Conference on Machine Learning. PMLR, 5583--5594."},{"key":"e_1_3_2_1_16_1","volume-title":"Referring transformer: a one-step approach to multi-task visual grounding. Advances in neural information processing systems, 34","author":"Li Muchen","year":"1965","unstructured":"Muchen Li and Leonid Sigal. 2021. Referring transformer: a one-step approach to multi-task visual grounding. Advances in neural information processing systems, 34, 19652--19664."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00602"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_19_1","article-title":"Multi-modal mutual attention and iterative interaction for referring image segmentation","author":"Liu Chang","year":"2023","unstructured":"Chang Liu, Henghui Ding, Yulun Zhang, and Xudong Jiang. 2023. Multi-modal mutual attention and iterative interaction for referring image segmentation. IEEE Transactions on Image Processing.","journal-title":"IEEE Transactions on Image Processing."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.143"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_22_1","unstructured":"Jiasen Lu Dhruv Batra Devi Parikh and Stefan Lee. 2019. Vilbert: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3414006"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01005"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.9"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_48"},{"key":"e_1_3_2_1_27_1","volume-title":"International Conference on Machine Learning. PMLR, 8821--8831","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In International Conference on Machine Learning. PMLR, 8821--8831."},{"key":"e_1_3_2_1_28_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00679"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01139"},{"key":"e_1_3_2_1_31_1","unstructured":"Jianzong Wu Xiangtai Li Xia Li Henghui Ding Yunhai Tong and Dacheng Tao. 2023. Towards robust referring image segmentation. arXiv preprint arXiv:2209.09554."},{"key":"e_1_3_2_1_32_1","first-page":"12077","article-title":"Segformer: simple and efficient design for semantic segmentation with transformers","volume":"34","author":"Xie Enze","year":"2021","unstructured":"Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M Alvarez, and Ping Luo. 2021. Segformer: simple and efficient design for semantic segmentation with transformers. Advances in Neural Information Processing Systems, 34, 12077--12090.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01762"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00142"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_5"}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","location":"Phuket Thailand","acronym":"ICMR '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"]},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658095","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658095","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:47:50Z","timestamp":1755766070000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658095"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":35,"alternative-id":["10.1145\/3652583.3658095","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658095","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}