{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,11]],"date-time":"2025-05-11T04:01:53Z","timestamp":1746936113905,"version":"3.40.5"},"reference-count":69,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2025,1,23]],"date-time":"2025-01-23T00:00:00Z","timestamp":1737590400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,23]],"date-time":"2025-01-23T00:00:00Z","timestamp":1737590400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s11263-024-02325-y","type":"journal-article","created":{"date-parts":[[2025,1,23]],"date-time":"2025-01-23T09:42:07Z","timestamp":1737625327000},"page":"3597-3612","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["A Mutual Supervision Framework for Referring Expression Segmentation and Generation"],"prefix":"10.1007","volume":"133","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2748-0748","authenticated-orcid":false,"given":"Shijia","family":"Huang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Feng","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hao","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shilong","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lei","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Liwei","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,1,23]]},"reference":[{"key":"2325_CR1","unstructured":"Banerjee, S., & Lavie, A. (2005). METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Goldstein, J., Lavie, A., Lin, C.-Y., & Voss, C. R. (eds.), Proceedings of the workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization@ACL 2005, Ann Arbor, Michigan, USA, June 29, 2005 (pp. 65\u201372). Association for Computational Linguistics. https:\/\/aclanthology.org\/W05-0909\/."},{"key":"2325_CR2","unstructured":"Bengio, S., Vinyals, O., Jaitly, N., & Shazeer, N. (2015). Scheduled sampling for sequence prediction with recurrent neural networks. In Cortes, C., Lawrence, N. D., Lee, D. D., Sugiyama, M., & Garnett, R. (eds.), Advances in neural information processing systems 28: annual conference on neural information processing systems 2015, December 7\u201312, 2015, Montreal, Quebec, Canada (pp. 1171\u20131179). https:\/\/proceedings.neurips.cc\/paper\/2015\/hash\/e995f98d56967d946471af29d7bf99f1-Abstract.html."},{"key":"2325_CR3","doi-asserted-by":"publisher","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., & Zagoruyko, S. (2020). End-to-end object detection with transformers. In Vedaldi, A., Bischof, H., Brox, T., & Frahm, J.-M. (eds.), Computer vision - ECCV 2020 - 16th European conference, Glasgow, UK, August 23\u201328, 2020, proceedings, Part I, volume 12346 of Lecture Notes in Computer Science (pp. 213\u2013229). Springer. https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13.","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"2325_CR4","unstructured":"Chen, Y.-W., Tsai, Y.-H., Wang, T., Lin, Y.-Y., & Yang, M.-H. (2019b). Referring expression object segmentation with caption-aware consistency. In 30th British machine vision conference 2019, BMVC 2019, Cardiff, UK, September 9\u201312, 2019 (p. 263). BMVA Press. https:\/\/bmvc2019.org\/wp-content\/uploads\/papers\/0196-paper.pdf."},{"key":"2325_CR5","doi-asserted-by":"publisher","unstructured":"Chen, D.-J., Jia, S., Lo, Y.-C., Chen, H.-T., & Liu, T.-L. (2019a). See-through-text grouping for referring image segmentation. In 2019 IEEE\/CVF international conference on computer vision, ICCV 2019, Seoul, Korea (South), October 27\u2013November 2, 2019 (pp. 7453\u20137462). IEEE. https:\/\/doi.org\/10.1109\/ICCV.2019.00755.","DOI":"10.1109\/ICCV.2019.00755"},{"key":"2325_CR6","doi-asserted-by":"publisher","unstructured":"Cheng, B., Misra, I., Schwing, A.\u00a0G., Kirillov, A., & Girdhar, R. (2022). Masked-attention mask transformer for universal image segmentation. In IEEE\/CVF conference on computer vision and pattern recognition, CVPR 2022, New Orleans, LA, USA, June 18\u201324, 2022 (pp. 1280\u20131289). IEEE. https:\/\/doi.org\/10.1109\/CVPR52688.2022.00135.","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"2325_CR7","unstructured":"Cho, J., Lei, J., Tan, H., & Bansal, M. (2021). Unifying vision-and-language tasks via text generation. In Meila, M., & Zhang, T. (eds.), Proceedings of the 38th international conference on machine learning, ICML 2021, 18\u201324 July 2021, virtual event, volume 139 of Proceedings of machine learning research (pp. 1931\u20131942). PMLR. http:\/\/proceedings.mlr.press\/v139\/cho21a.html."},{"key":"2325_CR8","doi-asserted-by":"publisher","unstructured":"Deng, J., Yang, Z., Chen, T., Zhou, W., & Li, H. (2021). Transvg: End-to-end visual grounding with transformers. In 2021 IEEE\/CVF international conference on computer vision, ICCV 2021, Montreal, QC, Canada, October 10\u201317, 2021 (pp. 1749\u20131759). IEEE. https:\/\/doi.org\/10.1109\/ICCV48922.2021.00179.","DOI":"10.1109\/ICCV48922.2021.00179"},{"issue":"3","key":"2325_CR9","doi-asserted-by":"publisher","first-page":"1670","DOI":"10.1109\/TPAMI.2020.3023438","volume":"44","author":"C Deng","year":"2022","unstructured":"Deng, C., Qi, W., Qingyao, W., Fuyuan, H., Lyu, F., & Tan, M. (2022). Visual grounding via accumulated attention. IEEE Transactions on Pattern Analysis and Machine Intelligence, 44(3), 1670\u20131684. https:\/\/doi.org\/10.1109\/TPAMI.2020.3023438","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2325_CR10","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M.-W., Lee, K., & Toutanova, K. (2019). BERT: Pre-training of deep bidirectional transformers for language understanding. In Burstein, J., Doran, C., & Solorio, T. (eds.), Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: Human language technologies, NAACL-HLT 2019, Minneapolis, MN, USA, June 2\u20137, 2019, Volume 1 (long and short papers) (pp. 4171\u20134186). Association for Computational Linguistics. https:\/\/doi.org\/10.18653\/V1\/N19-1423.","DOI":"10.18653\/V1\/N19-1423"},{"key":"2325_CR11","doi-asserted-by":"publisher","unstructured":"Ding, H., Liu, C., Wang, S., & Jiang, X. (2021). Vision-language transformer and query generation for referring segmentation. In 2021 IEEE\/cvf international conference on computer vision, ICCV 2021, Montreal, QC, Canada, October 10\u201317, 2021 (pp. 16301\u201316310). IEEE. https:\/\/doi.org\/10.1109\/ICCV48922.2021.01601.","DOI":"10.1109\/ICCV48922.2021.01601"},{"key":"2325_CR12","unstructured":"Dognin, P.\u00a0L., Melnyk, I., Mroueh, Y., Ross, J., & Sercu, T. (2019). Improved adversarial image captioning. In Deep generative models for highly structured data, ICLR 2019 workshop, New Orleans, Louisiana, United States, May 6, 2019. OpenReview.net. https:\/\/openreview.net\/forum?id=rkepX8LFuE."},{"issue":"4","key":"2325_CR13","doi-asserted-by":"publisher","first-page":"299","DOI":"10.1080\/09540090802413145","volume":"20","author":"F Doshi","year":"2008","unstructured":"Doshi, F., & Roy, N. (2008). Spoken language interaction with model uncertainty: An adaptive human-robot interaction system. Connection Science, 20(4), 299\u2013318. https:\/\/doi.org\/10.1080\/09540090802413145","journal-title":"Connection Science"},{"key":"2325_CR14","doi-asserted-by":"publisher","unstructured":"Freitag, M., & Al-Onaizan, Y. (2017). Beam search strategies for neural machine translation. In Luong, T., Birch, A., Neubig, G., & Finch, A. W. (eds.), Proceedings of the first workshop on neural machine translation, NMT@ACL 2017, Vancouver, Canada, August 4, 2017 (pp. 56\u201360). Association for Computational Linguistics. https:\/\/doi.org\/10.18653\/V1\/W17-3207.","DOI":"10.18653\/V1\/W17-3207"},{"key":"2325_CR15","doi-asserted-by":"publisher","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., & Girshick, R.\u00a0B. (2017). Mask R-CNN. In IEEE International conference on computer vision, ICCV 2017, Venice, Italy, October 22\u201329, 2017 (pp. 2980\u20132988). IEEE Computer Society. https:\/\/doi.org\/10.1109\/ICCV.2017.322.","DOI":"10.1109\/ICCV.2017.322"},{"key":"2325_CR16","doi-asserted-by":"publisher","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In 2016 IEEE conference on computer vision and pattern recognition, CVPR 2016, Las Vegas, NV, USA, June 27\u201330, 2016 (pp. 770\u2013778). IEEE Computer Society. https:\/\/doi.org\/10.1109\/CVPR.2016.90.","DOI":"10.1109\/CVPR.2016.90"},{"key":"2325_CR17","doi-asserted-by":"publisher","unstructured":"Hu, R., Rohrbach, M., & Darrell, T. (2016). Segmentation from natural language expressions. In Leibe, B., Matas, J., Sebe, N., & Welling, M. (eds.), Computer vision - ECCV 2016 - 14th European conference, Amsterdam, The Netherlands, October 11\u201314, 2016, proceedings, Part I, volume 9905 of Lecture Notes in Computer Science (pp. 108\u2013124). Springer. https:\/\/doi.org\/10.1007\/978-3-319-46448-0_7.","DOI":"10.1007\/978-3-319-46448-0_7"},{"key":"2325_CR18","doi-asserted-by":"publisher","unstructured":"Hu, Z., Feng, G., Sun, J., Zhang, L., & Lu, H. (2020). Bi-directional relationship inferring network for referring image segmentation. In 2020 IEEE\/CVF conference on computer vision and pattern recognition, CVPR 2020, Seattle, WA, USA, June 13\u201319, 2020 (pp. 4423\u20134432). Computer Vision Foundation\/IEEE. https:\/\/doi.org\/10.1109\/CVPR42600.2020.00448. https:\/\/openaccess.thecvf.com\/content_CVPR_2020\/html\/Hu_Bi-Directional_Relationship_Inferring_Network_for_Referring_Image_Segmentation_CVPR_2020_paper.html.","DOI":"10.1109\/CVPR42600.2020.00448"},{"key":"2325_CR19","doi-asserted-by":"publisher","unstructured":"Huang, S., Chen, Y., Jia, J., & Wang, L. (2022). Multi-view transformer for 3d visual grounding. In IEEE\/CVF conference on computer vision and pattern recognition, CVPR 2022, New Orleans, LA, USA, June 18\u201324, 2022 (pp. 15503\u201315512). IEEE. https:\/\/doi.org\/10.1109\/CVPR52688.2022.01508.","DOI":"10.1109\/CVPR52688.2022.01508"},{"key":"2325_CR20","doi-asserted-by":"publisher","unstructured":"Huang, S., Hui, T., Liu, S., Li, G., Wei, Y., Han, J., Liu, L., & Li, B. (2020). Referring image segmentation via cross-modal progressive comprehension. In 2020 IEEE\/CVF conference on computer vision and pattern recognition, CVPR 2020, Seattle, WA, USA, June 13\u201319, 2020 (pp. 10485\u201310494). Computer Vision Foundation\/IEEE. https:\/\/doi.org\/10.1109\/CVPR42600.2020.01050. https:\/\/openaccess.thecvf.com\/content_CVPR_2020\/html\/Huang_Referring_Image_Segmentation_via_Cross-Modal_Progressive_Comprehension_CVPR_2020_paper.html.","DOI":"10.1109\/CVPR42600.2020.01050"},{"key":"2325_CR21","doi-asserted-by":"publisher","unstructured":"Hui, T., Liu, S., Huang, S., Li, G., Yu, S., Zhang, F., & Han, J. (2020). Linguistic structure guided context modeling for referring image segmentation. In Vedaldi, A., Bischof, H., Brox, T., & Frahm, J.-M. (eds.), Computer vision - ECCV 2020 - 16th European conference, Glasgow, UK, August 23\u201328, 2020, proceedings, Part X, volume 12355 of Lecture Notes in Computer Science (pp. 59\u201375). Springer. https:\/\/doi.org\/10.1007\/978-3-030-58607-2_4.","DOI":"10.1007\/978-3-030-58607-2_4"},{"key":"2325_CR22","doi-asserted-by":"publisher","unstructured":"Jiang, H., Lin, Y., Han, D., Song, S., & Huang, G. (2022). Pseudo-q: Generating pseudo language queries for visual grounding. In IEEE\/CVF conference on computer vision and pattern recognition, CVPR 2022, New Orleans, LA, USA, June 18\u201324, 2022 (pp. 15492\u201315502). IEEE. https:\/\/doi.org\/10.1109\/CVPR52688.2022.01507.","DOI":"10.1109\/CVPR52688.2022.01507"},{"key":"2325_CR23","doi-asserted-by":"publisher","unstructured":"Jing, Y., Kong, T., Wang, W., Wang, L., Li, L., & Tan, T. (2021). Locate then segment: A strong pipeline for referring image segmentation. In IEEE conference on computer vision and pattern recognition, CVPR 2021, virtual, June 19\u201325, 2021 (pp. 9858\u20139867. Computer Vision Foundation\/IEEE. https:\/\/doi.org\/10.1109\/CVPR46437.2021.00973. https:\/\/openaccess.thecvf.com\/content\/CVPR2021\/html\/Jing_Locate_Then_Segment_A_Strong_Pipeline_for_Referring_Image_Segmentation_CVPR_2021_paper.html.","DOI":"10.1109\/CVPR46437.2021.00973"},{"key":"2325_CR24","doi-asserted-by":"publisher","unstructured":"Kamath, A., Singh, M., LeCun, Y., Synnaeve, G., Misra, I., & Carion, N. (2021). MDETR-modulated detection for end-to-end multi-modal understanding. In 2021 IEEE\/CVF international conference on computer vision, ICCV 2021, Montreal, QC, Canada, October 10\u201317, 2021 (pp. 1760\u20131770). IEEE. https:\/\/doi.org\/10.1109\/ICCV48922.2021.00180.","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"2325_CR25","doi-asserted-by":"publisher","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., & Berg, T.\u00a0L. (2014). Referitgame: Referring to objects in photographs of natural scenes. In Moschitti, A., Pang, B., & Daelemans, W. (eds.), Proceedings of the 2014 conference on empirical methods in natural language processing, EMNLP 2014, October 25\u201329, 2014, Doha, Qatar, A meeting of SIGDAT, a Special Interest Group of the ACL (pp. 787\u2013798). ACL. https:\/\/doi.org\/10.3115\/V1\/D14-1086.","DOI":"10.3115\/V1\/D14-1086"},{"key":"2325_CR26","doi-asserted-by":"publisher","unstructured":"Kim, J., Ko, H., & Wu, J. (2020). Conan: A complementary neighboring-based attention network for referring expression generation. In Scott, D., Bel, N., & Zong, C. (eds.), Proceedings of the 28th international conference on computational linguistics, COLING 2020, Barcelona, Spain (Online), December 8\u201313, 2020 (pp. 1952\u20131962). International Committee on Computational Linguistics. https:\/\/doi.org\/10.18653\/V1\/2020.COLING-MAIN.177.","DOI":"10.18653\/V1\/2020.COLING-MAIN.177"},{"key":"2325_CR27","doi-asserted-by":"publisher","unstructured":"Kim, N., Kim, D., Kwak, S., Lan, C., & Zeng, W. (2022). Restr: Convolution-free referring image segmentation using transformers. In IEEE\/CVF conference on computer vision and pattern recognition, CVPR 2022, New Orleans, LA, USA, June 18\u201324, 2022 (pp. 18124\u201318133). IEEE. https:\/\/doi.org\/10.1109\/CVPR52688.2022.01761.","DOI":"10.1109\/CVPR52688.2022.01761"},{"issue":"1","key":"2325_CR28","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/S11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., Zhu, Y., Groth, O., Johnson, J., Hata, K., Kravitz, J., Chen, S., Kalantidis, Y., Li, L.-J., Shamma, D. A., Bernstein, M. S., & Fei-Fei, L. (2017). Visual genome: Connecting language and vision using crowdsourced dense image annotations. International Journal of Computer Vision, 123(1), 32\u201373. https:\/\/doi.org\/10.1007\/S11263-016-0981-7","journal-title":"International Journal of Computer Vision"},{"key":"2325_CR29","unstructured":"Li, M., & Sigal, L. (2021). Referring transformer: A one-step approach to multi-task visual grounding. In Ranzato, M., Beygelzimer, A., Dauphin, Y. N., Liang, P., & Vaughan, J. W. (eds.), Advances in neural information processing systems 34: annual conference on neural information processing systems 2021, NeurIPS 2021, December 6\u201314, 2021, virtual (pp. 19652\u201319664). https:\/\/proceedings.neurips.cc\/paper\/2021\/hash\/a376802c0811f1b9088828288eb0d3f0-Abstract.html."},{"key":"2325_CR30","doi-asserted-by":"publisher","unstructured":"Li, R., Li, K., Kuo, Y.-C., Shu, M., Qi, X., Shen, X., & Jia, J. (2018). Referring image segmentation via recurrent refinement networks. In 2018 IEEE conference on computer vision and pattern recognition, CVPR 2018, Salt Lake City, UT, USA, June 18\u201322, 2018 (pp. 5745\u20135753). Computer Vision Foundation\/IEEE Computer Society. https:\/\/doi.org\/10.1109\/CVPR.2018.00602. http:\/\/openaccess.thecvf.com\/content_cvpr_2018\/html\/Li_Referring_Image_Segmentation_CVPR_2018_paper.html.","DOI":"10.1109\/CVPR.2018.00602"},{"key":"2325_CR31","doi-asserted-by":"publisher","unstructured":"Li, B., Qi, X., Lukasiewicz, T., & Torr, P. H.\u00a0S. (2020). Manigan: Text-guided image manipulation. In 2020 IEEE\/CVF conference on computer vision and pattern recognition, CVPR 2020, Seattle, WA, USA, June 13\u201319, 2020 (pp. 7877\u20137886). Computer Vision Foundation\/IEEE. https:\/\/doi.org\/10.1109\/CVPR42600.2020.00790. https:\/\/openaccess.thecvf.com\/content_CVPR_2020\/html\/Li_ManiGAN_Text-Guided_Image_Manipulation_CVPR_2020_paper.html.","DOI":"10.1109\/CVPR42600.2020.00790"},{"key":"2325_CR32","doi-asserted-by":"publisher","unstructured":"Lin, T.-Y., Maire, M., Belongie, S. J., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C.\u00a0L. (2014). Microsoft COCO: Common objects in context. In Fleet, D. J., Pajdla, T., Schiele, B., & Tuytelaars, T. (eds.,) Computer Vision - ECCV 2014 - 13th European conference, Zurich, Switzerland, September 6\u201312, 2014, Proceedings, Part V, volume 8693 of Lecture Notes in Computer Science (pp. 740\u2013755). Springer. https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2325_CR33","doi-asserted-by":"publisher","unstructured":"Liu, D., Zhang, H., Wu, F., & Zha, Z.-J. (2019). Learning to assemble neural module tree networks for visual grounding. In 2019 IEEE\/CVF international conference on computer vision, ICCV 2019, Seoul, Korea (South), October 27\u2013November 2, 2019 (pp. 4672\u20134681). IEEE. https:\/\/doi.org\/10.1109\/ICCV.2019.00477.","DOI":"10.1109\/ICCV.2019.00477"},{"key":"2325_CR34","doi-asserted-by":"publisher","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., & Guo, B. (2021). Swin transformer: Hierarchical vision transformer using shifted windows. In 2021 IEEE\/CVF international conference on computer vision, ICCV 2021, Montreal, QC, Canada, October 10\u201317, 2021 (pp. 9992\u201310002). IEEE. https:\/\/doi.org\/10.1109\/ICCV48922.2021.00986.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"2325_CR35","doi-asserted-by":"publisher","unstructured":"Liu, C., Lin, Z., Shen, X., Yang, J., Lu, X., & Yuille, A.\u00a0L. (2017b). Recurrent multimodal interaction for referring image segmentation. In IEEE international conference on computer vision, ICCV 2017, Venice, Italy, October 22\u201329, 2017 (pp. 1280\u20131289). IEEE Computer Society. https:\/\/doi.org\/10.1109\/ICCV.2017.143.","DOI":"10.1109\/ICCV.2017.143"},{"key":"2325_CR36","doi-asserted-by":"publisher","unstructured":"Liu, J., Wang, L., & Yang, M.-H. (2017a). Referring expression generation and comprehension via attributes. In IEEE International conference on computer vision, ICCV 2017, Venice, Italy, October 22\u201329, 2017 (pp. 4866\u20134874). IEEE Computer Society. https:\/\/doi.org\/10.1109\/ICCV.2017.520.","DOI":"10.1109\/ICCV.2017.520"},{"key":"2325_CR37","doi-asserted-by":"publisher","first-page":"3657","DOI":"10.1109\/TMM.2022.3163578","volume":"25","author":"C Liu","year":"2023","unstructured":"Liu, C., Jiang, X., & Ding, H. (2023). Instance-specific feature propagation for referring segmentation. IEEE Transactions on Multimedia, 25, 3657\u20133667. https:\/\/doi.org\/10.1109\/TMM.2022.3163578","journal-title":"IEEE Transactions on Multimedia"},{"key":"2325_CR38","doi-asserted-by":"publisher","first-page":"5244","DOI":"10.1109\/TIP.2020.2979010","volume":"29","author":"J Liu","year":"2020","unstructured":"Liu, J., Wang, W., Wang, L., & Yang, M.-H. (2020). Attribute-guided attention for referring expression generation and comprehension. IEEE Transactions on Image Processing, 29, 5244\u20135258. https:\/\/doi.org\/10.1109\/TIP.2020.2979010","journal-title":"IEEE Transactions on Image Processing"},{"key":"2325_CR39","doi-asserted-by":"publisher","unstructured":"Lopes, L. S., & Teixeira, A. J. S. (2000). Human-robot interaction through spoken language dialogue. In IEEE\/RSJ international conference on intelligent robots and systems, IROS 2000, October 30\u2013November 5, 2000, Takamatsu, Japan (pp. 528\u2013534). IEEE. https:\/\/doi.org\/10.1109\/IROS.2000.894658.","DOI":"10.1109\/IROS.2000.894658"},{"key":"2325_CR40","unstructured":"Loshchilov, I., & Hutter, F. (2019). Decoupled weight decay regularization. In 7th international conference on learning representations, ICLR 2019, New Orleans, LA, USA, May 6\u20139, 2019. OpenReview.net. https:\/\/openreview.net\/forum?id=Bkg6RiCqY7."},{"key":"2325_CR41","doi-asserted-by":"publisher","unstructured":"Luo, G., Zhou, Y., Ji, R., Sun, X., Su, J., Lin, C.-W., & Tian, Q. (2020b). Cascade grouped attention network for referring expression segmentation. In Chen, C. W., Cucchiara, R., Hua, X.-S., Qi, G.-J., Ricci, E., Zhang, Z., & Zimmermann, R. (eds.), MM \u201920: The 28th ACM international conference on multimedia, virtual event\/Seattle, WA, USA, October 12\u201316, 2020 (pp. 1274\u20131282). ACM. https:\/\/doi.org\/10.1145\/3394171.3414006.","DOI":"10.1145\/3394171.3414006"},{"key":"2325_CR42","doi-asserted-by":"publisher","unstructured":"Luo, G., Zhou, Y., Sun, X., Cao, L., Wu, C., Deng, C., & Ji, R. (2020a). Multi-task collaborative network for joint referring expression comprehension and segmentation. In 2020 IEEE\/CVF conference on computer vision and pattern recognition, CVPR 2020, Seattle, WA, USA, June 13\u201319, 2020 (pp. 10031\u201310040). Computer Vision Foundation\/IEEE. https:\/\/doi.org\/10.1109\/CVPR42600.2020.01005. https:\/\/openaccess.thecvf.com\/content_CVPR_2020\/html\/Luo_Multi-Task_Collaborative_Network_for_Joint_Referring_Expression_Comprehension_and_Segmentation_CVPR_2020_paper.html.","DOI":"10.1109\/CVPR42600.2020.01005"},{"key":"2325_CR43","doi-asserted-by":"publisher","unstructured":"Mao, J., Huang, J., Toshev, A., Camburu, O., Yuille, A. L., & Murphy, K. (2016). Generation and comprehension of unambiguous object descriptions. In 2016 IEEE conference on computer vision and pattern recognition, CVPR 2016, Las Vegas, NV, USA, June 27\u201330, 2016 (pp. 11\u201320). IEEE Computer Society. https:\/\/doi.org\/10.1109\/CVPR.2016.9.","DOI":"10.1109\/CVPR.2016.9"},{"key":"2325_CR44","doi-asserted-by":"publisher","unstructured":"Margffoy-Tuay, E., P\u00e9rez, J. C., Botero, E., & Arbel\u00e1ez, P. (2018). Dynamic multimodal instance segmentation guided by natural language queries. In Ferrari, V., Hebert, M., Sminchisescu, C., & Weiss, Y. (eds.,) Computer vision - ECCV 2018 - 15th European conference, Munich, Germany, September 8\u201314, 2018, proceedings, Part XI, volume 11215 of Lecture Notes in Computer Science (pp. 656\u2013672). Springer. https:\/\/doi.org\/10.1007\/978-3-030-01252-6_39.","DOI":"10.1007\/978-3-030-01252-6_39"},{"key":"2325_CR45","doi-asserted-by":"publisher","unstructured":"Nagaraja, V.\u00a0K., Morariu, V.\u00a0I., & Davis, L.\u00a0S. (2016). Modeling context between objects for referring expression understanding. In Leibe, B., Matas, J., Sebe, N., & Welling, M. (eds.), Computer vision - ECCV 2016 - 14th European conference, Amsterdam, The Netherlands, October 11\u201314, 2016, proceedings, Part IV, volume 9908 of Lecture Notes in Computer Science (pp. 792\u2013807). Springer. https:\/\/doi.org\/10.1007\/978-3-319-46493-0_48.","DOI":"10.1007\/978-3-319-46493-0_48"},{"issue":"1","key":"2325_CR46","doi-asserted-by":"publisher","first-page":"347","DOI":"10.1109\/TPAMI.2019.2926266","volume":"43","author":"Y Niu","year":"2021","unstructured":"Niu, Y., Zhang, H., Zhiwu, L., & Chang, S.-F. (2021). Variational context: Exploiting visual and textual context for grounding referring expressions. IEEE Transactions on Pattern Analysis and Machine Intelligence, 43(1), 347\u2013359. https:\/\/doi.org\/10.1109\/TPAMI.2019.2926266","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2325_CR47","doi-asserted-by":"publisher","unstructured":"Plummer, B.\u00a0A., Wang, L., Cervantes, C.\u00a0M., Caicedo, J.\u00a0C., Hockenmaier, J., & Lazebnik, S. (2015). Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In 2015 IEEE international conference on computer vision, ICCV 2015, Santiago, Chile, December 7\u201313, 2015 (pp. 2641\u20132649). IEEE Computer Society. https:\/\/doi.org\/10.1109\/ICCV.2015.303.","DOI":"10.1109\/ICCV.2015.303"},{"key":"2325_CR48","unstructured":"Radford, A., Kim, J.\u00a0W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., Krueger, G., & Sutskever, I. (2021). Learning transferable visual models from natural language supervision. In Meila, M., & Zhang, T. (eds.), Proceedings of the 38th international conference on machine learning, ICML 2021, 18\u201324 July 2021, virtual event, volume 139 of proceedings of machine learning research (pp. 8748\u20138763). PMLR. http:\/\/proceedings.mlr.press\/v139\/radford21a.html."},{"key":"2325_CR49","doi-asserted-by":"publisher","unstructured":"Rennie, S. J., Marcheret, E., Mroueh, Y., Ross, J., & Goel, V. (2017). Self-critical sequence training for image captioning. In 2017 IEEE conference on computer vision and pattern recognition, CVPR 2017, Honolulu, HI, USA, July 21\u201326, 2017 (pp. 1179\u20131195). IEEE Computer Society. https:\/\/doi.org\/10.1109\/CVPR.2017.131.","DOI":"10.1109\/CVPR.2017.131"},{"key":"2325_CR50","doi-asserted-by":"publisher","first-page":"2446","DOI":"10.1109\/TMM.2022.3147385","volume":"25","author":"M Sun","year":"2023","unstructured":"Sun, M., Suo, W., Wang, P., Zhang, Y., & Qi, W. (2023). A proposal-free one-stage framework for referring expression comprehension and generation via dense cross-attention. IEEE Transactions on Multimedia, 25, 2446\u20132458. https:\/\/doi.org\/10.1109\/TMM.2022.3147385","journal-title":"IEEE Transactions on Multimedia"},{"key":"2325_CR51","volume-title":"Reinforcement learning: An introduction","author":"RS Sutton","year":"2018","unstructured":"Sutton, R. S., & Barto, A. G. (2018). Reinforcement learning: An introduction. Cambridge: MIT Press."},{"key":"2325_CR52","doi-asserted-by":"publisher","unstructured":"Tanaka, M., Itamochi, T., Narioka, K., Sato, I., Ushiku, Y., & Harada, T. (2019). Generating easy-to-understand referring expressions for target identifications. In 2019 IEEE\/CVF international conference on computer vision, ICCV 2019, Seoul, Korea (South), October 27\u2013November 2, 2019 (pp. 5793\u20135802). IEEE. https:\/\/doi.org\/10.1109\/ICCV.2019.00589.","DOI":"10.1109\/ICCV.2019.00589"},{"key":"2325_CR53","doi-asserted-by":"publisher","unstructured":"Vedantam, R., Zitnick, C.\u00a0L., & Parikh, D. (2015). Cider: Consensus-based image description evaluation. In IEEE conference on computer vision and pattern recognition, CVPR 2015, Boston, MA, USA, June 7\u201312, 2015 (pp. 4566\u20134575). IEEE Computer Society. https:\/\/doi.org\/10.1109\/CVPR.2015.7299087.","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"2325_CR54","doi-asserted-by":"publisher","unstructured":"Wang, Z., Lu, Y., Li, Q., Tao, X., Guo, Y., Gong, M., & Liu, T. (2022). CRIS: Clip-driven referring image segmentation. In IEEE\/CVF conference on computer vision and pattern recognition, CVPR 2022, New Orleans, LA, USA, June 18\u201324, 2022 (pp. 11676\u201311685). IEEE. https:\/\/doi.org\/10.1109\/CVPR52688.2022.01139.","DOI":"10.1109\/CVPR52688.2022.01139"},{"key":"2325_CR55","doi-asserted-by":"publisher","unstructured":"Wang, L., Huang, J., Li, Y., Xu, K., Yang, Z., & Yu, D. (2021). Improving weakly supervised visual grounding by contrastive knowledge distillation. In IEEE conference on computer vision and pattern recognition, CVPR 2021, virtual, June 19\u201325, 2021 (pp. 14090\u201314100). Computer Vision Foundation\/IEEE. https:\/\/doi.org\/10.1109\/CVPR46437.2021.01387. https:\/\/openaccess.thecvf.com\/content\/CVPR2021\/html\/Wang_Improving_Weakly_Supervised_Visual_Grounding_by_Contrastive_Knowledge_Distillation_CVPR_2021_paper.html.","DOI":"10.1109\/CVPR46437.2021.01387"},{"key":"2325_CR56","doi-asserted-by":"publisher","unstructured":"Wang, L., Li, Y., & Lazebnik, S. (2016). Learning deep structure-preserving image-text embeddings. In 2016 IEEE conference on computer vision and pattern recognition, CVPR 2016, Las Vegas, NV, USA, June 27\u201330, 2016 (pp. 5005\u20135013). IEEE Computer Society. https:\/\/doi.org\/10.1109\/CVPR.2016.541.","DOI":"10.1109\/CVPR.2016.541"},{"key":"2325_CR57","doi-asserted-by":"publisher","first-page":"229","DOI":"10.1007\/BF00992696","volume":"8","author":"RJ Williams","year":"1992","unstructured":"Williams, R. J. (1992). Simple statistical gradient-following algorithms for connectionist reinforcement learning. Machine Learning, 8, 229\u2013256. https:\/\/doi.org\/10.1007\/BF00992696","journal-title":"Machine Learning"},{"key":"2325_CR58","doi-asserted-by":"publisher","unstructured":"Wu, C., Lin, Z., Cohen, S., Bui, T., & Maji, S. (2020). Phrasecut: Language-based image segmentation in the wild. In 2020 IEEE\/CVF conference on computer vision and pattern recognition, CVPR 2020, Seattle, WA, USA, June 13\u201319, 2020 (pp. 10213\u201310222). Computer Vision Foundation\/IEEE. https:\/\/doi.org\/10.1109\/CVPR42600.2020.01023. https:\/\/openaccess.thecvf.com\/content_CVPR_2020\/html\/Wu_PhraseCut_Language-Based_Image_Segmentation_in_the_Wild_CVPR_2020_paper.html.","DOI":"10.1109\/CVPR42600.2020.01023"},{"key":"2325_CR59","doi-asserted-by":"publisher","unstructured":"Xia, W., Yang, Y., Xue, J.-H., & Wu, B. (2021). Tedigan: Text-guided diverse face image generation and manipulation. In IEEE conference on computer vision and pattern recognition, CVPR 2021, virtual, June 19\u201325, 2021 (pp. 2256\u20132265). Computer Vision Foundation\/IEEE. https:\/\/doi.org\/10.1109\/CVPR46437.2021.00229. https:\/\/openaccess.thecvf.com\/content\/CVPR2021\/html\/Xia_TediGAN_Text-Guided_Diverse_Face_Image_Generation_and_Manipulation_CVPR_2021_paper.html.","DOI":"10.1109\/CVPR46437.2021.00229"},{"key":"2325_CR60","unstructured":"Xu, B., Kong, W., & Chen, J. (2017). Semi-supervised image captioning via reconstruction. In Proceedings of international conference on computer vision (pp. 4135\u20134144)."},{"key":"2325_CR61","doi-asserted-by":"publisher","unstructured":"Yang, S., Xia, M., Li, G., Zhou, H.-Y., & Yu, Y. (2021). Bottom-up shift and reasoning for referring image segmentation. In IEEE conference on computer vision and pattern recognition, CVPR 2021, virtual, June 19\u201325, 2021 (pp. 11266\u201311275). Computer Vision Foundation\/IEEE. https:\/\/doi.org\/10.1109\/CVPR46437.2021.01111. URL https:\/\/openaccess.thecvf.com\/content\/CVPR2021\/html\/Yang_Bottom-Up_Shift_and_Reasoning_for_Referring_Image_Segmentation_CVPR_2021_paper.html.","DOI":"10.1109\/CVPR46437.2021.01111"},{"key":"2325_CR62","doi-asserted-by":"publisher","unstructured":"Yang, Z., Wang, J., Tang, Y., Chen, K., Zhao, H., & Torr, P. H.\u00a0S. (2022). LAVT: Language-aware vision transformer for referring image segmentation. In IEEE\/CVF conference on computer vision and pattern recognition, CVPR 2022, New Orleans, LA, USA, June 18\u201324, 2022 (pp. 18134\u201318144). IEEE. https:\/\/doi.org\/10.1109\/CVPR52688.2022.01762.","DOI":"10.1109\/CVPR52688.2022.01762"},{"key":"2325_CR63","doi-asserted-by":"publisher","unstructured":"Yang, Z., Chen, T., Wang, L., & Luo, J. (2020). Improving one-stage visual grounding by recursive sub-query construction. In Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.), Computer vision - ECCV 2020 - 16th European conference, Glasgow, UK, August 23\u201328, 2020, proceedings, Part XIV, volume 12359 of Lecture Notes in Computer Science (pp. 387\u2013404). Springer. https:\/\/doi.org\/10.1007\/978-3-030-58568-6_23.","DOI":"10.1007\/978-3-030-58568-6_23"},{"key":"2325_CR64","doi-asserted-by":"publisher","unstructured":"Yang, Z., Gong, B., Wang, L., Huang, W., Yu, D., & Luo, J. (2019). A fast and accurate one-stage approach to visual grounding. In 2019 IEEE\/CVF international conference on computer vision, ICCV 2019, Seoul, Korea (South), October 27\u2013November 2, 2019 (pp. 4682\u20134692). IEEE. https:\/\/doi.org\/10.1109\/ICCV.2019.00478.","DOI":"10.1109\/ICCV.2019.00478"},{"key":"2325_CR65","doi-asserted-by":"publisher","unstructured":"Ye, L., Rochan, M., Liu, Z., & Wang, Y. (2019). Cross-modal self-attention network for referring image segmentation. In IEEE conference on computer vision and pattern recognition, CVPR 2019, Long Beach, CA, USA, June 16\u201320, 2019 (pp. 10502\u201310511). Computer Vision Foundation\/IEEE. https:\/\/doi.org\/10.1109\/CVPR.2019.01075. http:\/\/openaccess.thecvf.com\/content_CVPR_2019\/html\/Ye_Cross-Modal_Self-Attention_Network_for_Referring_Image_Segmentation_CVPR_2019_paper.html.","DOI":"10.1109\/CVPR.2019.01075"},{"key":"2325_CR66","doi-asserted-by":"publisher","unstructured":"Yu, L., Lin, Z., Shen, X., Yang, J., Lu, X., Bansal, M., & Berg, T.\u00a0L. (2018). Mattnet: Modular attention network for referring expression comprehension. In 2018 IEEE conference on computer vision and pattern recognition, CVPR 2018, Salt Lake City, UT, USA, June 18\u201322, 2018 (pp. 1307\u20131315). Computer Vision Foundation\/IEEE Computer Society. https:\/\/doi.org\/10.1109\/CVPR.2018.00142. http:\/\/openaccess.thecvf.com\/content_cvpr_2018\/html\/Yu_MAttNet_Modular_Attention_CVPR_2018_paper.html.","DOI":"10.1109\/CVPR.2018.00142"},{"key":"2325_CR67","doi-asserted-by":"publisher","unstructured":"Yu, L., Poirson, P., Yang, S., Berg, A. C., & Berg, T. L. (2016). Modeling context in referring expressions. In Leibe, B., Matas, J., Sebe, N., & Welling, M. (eds.), Computer vision - ECCV 2016 - 14th European conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part II, volume 9906 of Lecture Notes in Computer Science (pp. 69\u201385). Springer. https:\/\/doi.org\/10.1007\/978-3-319-46475-6_5.","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"2325_CR68","doi-asserted-by":"publisher","unstructured":"Yu, L., Tan, H., Bansal, M., & Berg, T. L. (2017). A joint speaker-listener-reinforcer model for referring expressions. In 2017 IEEE conference on computer vision and pattern recognition, CVPR 2017, Honolulu, HI, USA, July 21\u201326, 2017 (pp. 3521\u20133529). IEEE Computer Society. https:\/\/doi.org\/10.1109\/CVPR.2017.375.","DOI":"10.1109\/CVPR.2017.375"},{"key":"2325_CR69","doi-asserted-by":"publisher","unstructured":"Zheng, D., Kong, T., Jing, Y., Wang, J., & Wang, X. (2022). Towards unifying reference expression generation and comprehension. In Goldberg, Y., Kozareva, Z., Zhang, Y. (eds.), Proceedings of the 2022 conference on empirical methods in natural language processing, EMNLP 2022, Abu Dhabi, United Arab Emirates, December 7\u201311, 2022 (pp. 6598\u20136611). Association for Computational Linguistics. https:\/\/doi.org\/10.18653\/V1\/2022.EMNLP-MAIN.442.","DOI":"10.18653\/V1\/2022.EMNLP-MAIN.442"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02325-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-024-02325-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02325-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,10]],"date-time":"2025-05-10T06:55:17Z","timestamp":1746860117000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-024-02325-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,1,23]]},"references-count":69,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["2325"],"URL":"https:\/\/doi.org\/10.1007\/s11263-024-02325-y","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"type":"print","value":"0920-5691"},{"type":"electronic","value":"1573-1405"}],"subject":[],"published":{"date-parts":[[2025,1,23]]},"assertion":[{"value":"21 March 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 December 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 January 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"All authors consent to see their work published if accepted.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}}]}}