{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:12:19Z","timestamp":1750219939356,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,11,7]],"date-time":"2022-11-07T00:00:00Z","timestamp":1667779200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Natural Science Foundation in China","award":["61672128"],"award-info":[{"award-number":["61672128"]}]},{"name":"Fundamental Research Fund for Central University","award":["DUT20TD107"],"award-info":[{"award-number":["DUT20TD107"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,11,7]]},"DOI":"10.1145\/3536221.3556570","type":"proceedings-article","created":{"date-parts":[[2022,11,4]],"date-time":"2022-11-04T15:54:14Z","timestamp":1667577254000},"page":"36-47","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Structured Multimodal Fusion Network for Referring Image Segmentation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6614-8871","authenticated-orcid":false,"given":"Mingcheng","family":"Xue","sequence":"first","affiliation":[{"name":"Dalian University of Technology, SCHOOL OF SOFTWARE TECHNOLOGY, DALIAN UNIVERSITY OF TECHNOLOGY, China"}]},{"given":"Yu","family":"Liu","sequence":"additional","affiliation":[{"name":"Dalian University of Technology, SCHOOL OF SOFTWARE TECHNOLOGY, DALIAN UNIVERSITY OF TECHNOLOGY, China"}]},{"given":"Kaiping","family":"Xu","sequence":"additional","affiliation":[{"name":"Dalian University of Technology, SCHOOL OF SOFTWARE TECHNOLOGY, DALIAN UNIVERSITY OF TECHNOLOGY, China"}]},{"given":"Haiyang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Dalian University of Technology, SCHOOL OF SOFTWARE TECHNOLOGY, DALIAN UNIVERSITY OF TECHNOLOGY, China"}]},{"given":"Chengyang","family":"Yu","sequence":"additional","affiliation":[{"name":"Dalian University of Technology, SCHOOL OF SOFTWARE TECHNOLOGY, DALIAN UNIVERSITY OF TECHNOLOGY, China"}]}],"member":"320","published-online":{"date-parts":[[2022,11,7]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2644615"},{"key":"e_1_3_2_1_2_1","volume-title":"MUTAN: Multimodal Tucker Fusion for Visual Question Answering. In IEEE International Conference on Computer Vision, ICCV 2017","author":"Hedi","year":"2017","unstructured":"Hedi Ben-younes, R\u00e9mi Cad\u00e8ne , Matthieu Cord , and Nicolas Thome . 2017 . MUTAN: Multimodal Tucker Fusion for Visual Question Answering. In IEEE International Conference on Computer Vision, ICCV 2017 , Venice, Italy , October 22-29, 2017. IEEE Computer Society, 2631\u20132639. https:\/\/doi.org\/10.1109\/ICCV.2017.285 10.1109\/ICCV.2017.285 Hedi Ben-younes, R\u00e9mi Cad\u00e8ne, Matthieu Cord, and Nicolas Thome. 2017. MUTAN: Multimodal Tucker Fusion for Visual Question Answering. In IEEE International Conference on Computer Vision, ICCV 2017, Venice, Italy, October 22-29, 2017. IEEE Computer Society, 2631\u20132639. https:\/\/doi.org\/10.1109\/ICCV.2017.285"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_4_1","volume-title":"See-Through-Text Grouping for Referring Image Segmentation. In 2019 IEEE\/CVF International Conference on Computer Vision, ICCV 2019","author":"Chen Ding-Jie","year":"2019","unstructured":"Ding-Jie Chen , Songhao Jia , Yi-Chen Lo , Hwann-Tzong Chen , and Tyng-Luh Liu . 2019 . See-Through-Text Grouping for Referring Image Segmentation. In 2019 IEEE\/CVF International Conference on Computer Vision, ICCV 2019 , Seoul, Korea (South), October 27 - November 2, 2019. IEEE, 7453\u20137462. https:\/\/doi.org\/10.1109\/ICCV.2019.00755 10.1109\/ICCV.2019.00755 Ding-Jie Chen, Songhao Jia, Yi-Chen Lo, Hwann-Tzong Chen, and Tyng-Luh Liu. 2019. See-Through-Text Grouping for Referring Image Segmentation. In 2019 IEEE\/CVF International Conference on Computer Vision, ICCV 2019, Seoul, Korea (South), October 27 - November 2, 2019. IEEE, 7453\u20137462. https:\/\/doi.org\/10.1109\/ICCV.2019.00755"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1082"},{"key":"e_1_3_2_1_6_1","volume-title":"3rd International Conference on Learning Representations, ICLR","author":"Chen Liang-Chieh","year":"2015","unstructured":"Liang-Chieh Chen , George Papandreou , Iasonas Kokkinos , Kevin Murphy , and Alan\u00a0 L. Yuille . 2015. Semantic Image Segmentation with Deep Convolutional Nets and Fully Connected CRFs . In 3rd International Conference on Learning Representations, ICLR 2015 , San Diego, CA , USA, May 7-9, 2015, Conference Track Proceedings, Yoshua Bengio and Yann LeCun (Eds .). http:\/\/arxiv.org\/abs\/1412.7062 Liang-Chieh Chen, George Papandreou, Iasonas Kokkinos, Kevin Murphy, and Alan\u00a0L. Yuille. 2015. Semantic Image Segmentation with Deep Convolutional Nets and Fully Connected CRFs. In 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings, Yoshua Bengio and Yann LeCun (Eds.). http:\/\/arxiv.org\/abs\/1412.7062"},{"key":"e_1_3_2_1_7_1","unstructured":"Liang-Chieh Chen George Papandreou Florian Schroff and Hartwig Adam. 2017. Rethinking Atrous Convolution for Semantic Image Segmentation. CoRR abs\/1706.05587(2017). arXiv:1706.05587http:\/\/arxiv.org\/abs\/1706.05587  Liang-Chieh Chen George Papandreou Florian Schroff and Hartwig Adam. 2017. Rethinking Atrous Convolution for Semantic Image Segmentation. CoRR abs\/1706.05587(2017). arXiv:1706.05587http:\/\/arxiv.org\/abs\/1706.05587"},{"key":"e_1_3_2_1_8_1","volume-title":"Dual Path Networks. In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017","author":"Chen Yunpeng","year":"2017","unstructured":"Yunpeng Chen , Jianan Li , Huaxin Xiao , Xiaojie Jin , Shuicheng Yan , and Jiashi Feng . 2017 . Dual Path Networks. In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017 , December 4-9, 2017, Long Beach, CA, USA, Isabelle Guyon, Ulrike von Luxburg, Samy Bengio, Hanna\u00a0M. Wallach, Rob Fergus, S.\u00a0V.\u00a0N. Vishwanathan, and Roman Garnett (Eds.). 4467\u20134475. https:\/\/proceedings.neurips.cc\/paper\/ 2017\/hash\/f7e0b956540676a129760a3eae309294-Abstract.html Yunpeng Chen, Jianan Li, Huaxin Xiao, Xiaojie Jin, Shuicheng Yan, and Jiashi Feng. 2017. Dual Path Networks. In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017, Long Beach, CA, USA, Isabelle Guyon, Ulrike von Luxburg, Samy Bengio, Hanna\u00a0M. Wallach, Rob Fergus, S.\u00a0V.\u00a0N. Vishwanathan, and Roman Garnett (Eds.). 4467\u20134475. https:\/\/proceedings.neurips.cc\/paper\/2017\/hash\/f7e0b956540676a129760a3eae309294-Abstract.html"},{"key":"e_1_3_2_1_9_1","volume-title":"The 28th ACM International Conference on Multimedia, Virtual Event \/ Seattle, WA, USA","author":"Cheng Yu","year":"2020","unstructured":"Yu Cheng , Zhe Gan , Yitong Li , Jingjing Liu , and Jianfeng Gao . 2020 . Sequential Attention GAN for Interactive Image Editing. In MM \u201920 : The 28th ACM International Conference on Multimedia, Virtual Event \/ Seattle, WA, USA , October 12-16, 2020, Chang\u00a0Wen Chen, Rita Cucchiara, Xian-Sheng Hua, Guo-Jun Qi, Elisa Ricci, Zhengyou Zhang, and Roger Zimmermann (Eds.). ACM, 4383\u20134391. https:\/\/doi.org\/10.1145\/3394171.3413551 10.1145\/3394171.3413551 Yu Cheng, Zhe Gan, Yitong Li, Jingjing Liu, and Jianfeng Gao. 2020. Sequential Attention GAN for Interactive Image Editing. In MM \u201920: The 28th ACM International Conference on Multimedia, Virtual Event \/ Seattle, WA, USA, October 12-16, 2020, Chang\u00a0Wen Chen, Rita Cucchiara, Xian-Sheng Hua, Guo-Jun Qi, Elisa Ricci, Zhengyou Zhang, and Roger Zimmermann (Eds.). ACM, 4383\u20134391. https:\/\/doi.org\/10.1145\/3394171.3413551"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"e_1_3_2_1_11_1","volume-title":"TransVG: End-to-End Visual Grounding with Transformers. In 2021 IEEE\/CVF International Conference on Computer Vision, ICCV 2021","author":"Deng Jiajun","year":"2021","unstructured":"Jiajun Deng , Zhengyuan Yang , Tianlang Chen , Wengang Zhou , and Houqiang Li . 2021 . TransVG: End-to-End Visual Grounding with Transformers. In 2021 IEEE\/CVF International Conference on Computer Vision, ICCV 2021 , Montreal, QC, Canada , October 10-17, 2021. IEEE, 1749\u20131759. https:\/\/doi.org\/10.1109\/ICCV48922.2021.00179 10.1109\/ICCV48922.2021.00179 Jiajun Deng, Zhengyuan Yang, Tianlang Chen, Wengang Zhou, and Houqiang Li. 2021. TransVG: End-to-End Visual Grounding with Transformers. In 2021 IEEE\/CVF International Conference on Computer Vision, ICCV 2021, Montreal, QC, Canada, October 10-17, 2021. IEEE, 1749\u20131759. https:\/\/doi.org\/10.1109\/ICCV48922.2021.00179"},{"key":"e_1_3_2_1_12_1","volume-title":"Vision-Language Transformer and Query Generation for Referring Segmentation. In 2021 IEEE\/CVF International Conference on Computer Vision, ICCV 2021","author":"Ding Henghui","year":"2021","unstructured":"Henghui Ding , Chang Liu , Suchen Wang , and Xudong Jiang . 2021 . Vision-Language Transformer and Query Generation for Referring Segmentation. In 2021 IEEE\/CVF International Conference on Computer Vision, ICCV 2021 , Montreal, QC, Canada , October 10-17, 2021. IEEE, 16301\u201316310. https:\/\/doi.org\/10.1109\/ICCV48922.2021.01601 10.1109\/ICCV48922.2021.01601 Henghui Ding, Chang Liu, Suchen Wang, and Xudong Jiang. 2021. Vision-Language Transformer and Query Generation for Referring Segmentation. In 2021 IEEE\/CVF International Conference on Computer Vision, ICCV 2021, Montreal, QC, Canada, October 10-17, 2021. IEEE, 16301\u201316310. https:\/\/doi.org\/10.1109\/ICCV48922.2021.01601"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-009-0275-4"},{"key":"e_1_3_2_1_14_1","volume-title":"Encoder Fusion Network With Co-Attention Embedding for Referring Image Segmentation. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021","author":"Feng Guang","year":"2021","unstructured":"Guang Feng , Zhiwei Hu , Lihe Zhang , and Huchuan Lu . 2021 . Encoder Fusion Network With Co-Attention Embedding for Referring Image Segmentation. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021 , virtual, June 19-25, 2021. Computer Vision Foundation \/ IEEE, 15506\u201315515. https:\/\/openaccess.thecvf.com\/content\/CVPR 2021\/html\/Feng_Encoder_Fusion_Network_With_Co-Attention_Embedding_for_Referring_Image_Segmentation_CVPR_2021_paper.html Guang Feng, Zhiwei Hu, Lihe Zhang, and Huchuan Lu. 2021. Encoder Fusion Network With Co-Attention Embedding for Referring Image Segmentation. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021, virtual, June 19-25, 2021. Computer Vision Foundation \/ IEEE, 15506\u201315515. https:\/\/openaccess.thecvf.com\/content\/CVPR2021\/html\/Feng_Encoder_Fusion_Network_With_Co-Attention_Embedding_for_Referring_Image_Segmentation_CVPR_2021_paper.html"},{"key":"e_1_3_2_1_15_1","volume-title":"Deep Residual Learning for Image Recognition. In 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016","author":"He Kaiming","year":"2016","unstructured":"Kaiming He , Xiangyu Zhang , Shaoqing Ren , and Jian Sun . 2016 . Deep Residual Learning for Image Recognition. In 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016 , Las Vegas, NV, USA , June 27-30, 2016. IEEE Computer Society, 770\u2013778. https:\/\/doi.org\/10.1109\/CVPR.2016.90 10.1109\/CVPR.2016.90 Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2016. Deep Residual Learning for Image Recognition. In 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016, Las Vegas, NV, USA, June 27-30, 2016. IEEE Computer Society, 770\u2013778. https:\/\/doi.org\/10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_7"},{"key":"e_1_3_2_1_17_1","volume-title":"Natural Language Object Retrieval. In 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016","author":"Hu Ronghang","year":"2016","unstructured":"Ronghang Hu , Huazhe Xu , Marcus Rohrbach , Jiashi Feng , Kate Saenko , and Trevor Darrell . 2016 . Natural Language Object Retrieval. In 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016 , Las Vegas, NV, USA , June 27-30, 2016. IEEE Computer Society, 4555\u20134564. https:\/\/doi.org\/10.1109\/CVPR.2016.493 10.1109\/CVPR.2016.493 Ronghang Hu, Huazhe Xu, Marcus Rohrbach, Jiashi Feng, Kate Saenko, and Trevor Darrell. 2016. Natural Language Object Retrieval. In 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016, Las Vegas, NV, USA, June 27-30, 2016. IEEE Computer Society, 4555\u20134564. https:\/\/doi.org\/10.1109\/CVPR.2016.493"},{"key":"e_1_3_2_1_18_1","volume-title":"Bi-Directional Relationship Inferring Network for Referring Image Segmentation. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2020","author":"Hu Zhiwei","year":"2020","unstructured":"Zhiwei Hu , Guang Feng , Jiayu Sun , Lihe Zhang , and Huchuan Lu . 2020 . Bi-Directional Relationship Inferring Network for Referring Image Segmentation. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2020 , Seattle, WA, USA , June 13-19, 2020. Computer Vision Foundation \/ IEEE, 4423\u20134432. https:\/\/doi.org\/10.1109\/CVPR42600.2020.00448 10.1109\/CVPR42600.2020.00448 Zhiwei Hu, Guang Feng, Jiayu Sun, Lihe Zhang, and Huchuan Lu. 2020. Bi-Directional Relationship Inferring Network for Referring Image Segmentation. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2020, Seattle, WA, USA, June 13-19, 2020. Computer Vision Foundation \/ IEEE, 4423\u20134432. https:\/\/doi.org\/10.1109\/CVPR42600.2020.00448"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.195"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01050"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58607-2_4"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the 32nd International Conference on Machine Learning, ICML 2015, Lille, France, 6-11 July 2015(JMLR Workshop and Conference Proceedings, Vol.\u00a037)","author":"Ioffe Sergey","year":"2015","unstructured":"Sergey Ioffe and Christian Szegedy . 2015 . Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . In Proceedings of the 32nd International Conference on Machine Learning, ICML 2015, Lille, France, 6-11 July 2015(JMLR Workshop and Conference Proceedings, Vol.\u00a037) , Francis\u00a0R. Bach and David\u00a0M. Blei (Eds.). JMLR.org, 448\u2013456. http:\/\/proceedings.mlr.press\/v37\/ioffe15.html Sergey Ioffe and Christian Szegedy. 2015. Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. In Proceedings of the 32nd International Conference on Machine Learning, ICML 2015, Lille, France, 6-11 July 2015(JMLR Workshop and Conference Proceedings, Vol.\u00a037), Francis\u00a0R. Bach and David\u00a0M. Blei (Eds.). JMLR.org, 448\u2013456. http:\/\/proceedings.mlr.press\/v37\/ioffe15.html"},{"key":"e_1_3_2_1_23_1","volume-title":"ACM Multimedia Conference","author":"Jiao Yang","year":"2021","unstructured":"Yang Jiao , Zequn Jie , Weixin Luo , Jingjing Chen , Yu-Gang Jiang , Xiaolin Wei , and Lin Ma . 2021 . Two-stage Visual Cues Enhancement Network for Referring Image Segmentation. In MM \u201921 : ACM Multimedia Conference , Virtual Event, China, October 20 - 24 , 2021, Heng\u00a0Tao Shen, Yueting Zhuang, John\u00a0R. Smith, Yang Yang, Pablo Cesar, Florian Metze, and Balakrishnan Prabhakaran (Eds.). ACM, 1331\u20131340. https:\/\/doi.org\/10.1145\/3474085.3475222 10.1145\/3474085.3475222 Yang Jiao, Zequn Jie, Weixin Luo, Jingjing Chen, Yu-Gang Jiang, Xiaolin Wei, and Lin Ma. 2021. Two-stage Visual Cues Enhancement Network for Referring Image Segmentation. In MM \u201921: ACM Multimedia Conference, Virtual Event, China, October 20 - 24, 2021, Heng\u00a0Tao Shen, Yueting Zhuang, John\u00a0R. Smith, Yang Yang, Pablo Cesar, Florian Metze, and Balakrishnan Prabhakaran (Eds.). ACM, 1331\u20131340. https:\/\/doi.org\/10.1145\/3474085.3475222"},{"key":"e_1_3_2_1_24_1","volume-title":"Locate Then Segment: A Strong Pipeline for Referring Image Segmentation. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021","author":"Jing Ya","year":"2021","unstructured":"Ya Jing , Tao Kong , Wei Wang , Liang Wang , Lei Li , and Tieniu Tan . 2021 . Locate Then Segment: A Strong Pipeline for Referring Image Segmentation. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021 , virtual, June 19-25, 2021. Computer Vision Foundation \/ IEEE, 9858\u20139867. https:\/\/openaccess.thecvf.com\/content\/CVPR 2021\/html\/Jing_Locate_Then_Segment_A_Strong_Pipeline_for_Referring_Image_Segmentation_CVPR_2021_paper.html Ya Jing, Tao Kong, Wei Wang, Liang Wang, Lei Li, and Tieniu Tan. 2021. Locate Then Segment: A Strong Pipeline for Referring Image Segmentation. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021, virtual, June 19-25, 2021. Computer Vision Foundation \/ IEEE, 9858\u20139867. https:\/\/openaccess.thecvf.com\/content\/CVPR2021\/html\/Jing_Locate_Then_Segment_A_Strong_Pipeline_for_Referring_Image_Segmentation_CVPR_2021_paper.html"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1086"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00419"},{"key":"e_1_3_2_1_27_1","volume-title":"Kingma and Jimmy Ba","author":"P.","year":"2015","unstructured":"Diederik\u00a0 P. Kingma and Jimmy Ba . 2015 . Adam : A Method for Stochastic Optimization. In 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings, Yoshua Bengio and Yann LeCun (Eds .). http:\/\/arxiv.org\/abs\/1412.6980 Diederik\u00a0P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings, Yoshua Bengio and Yann LeCun (Eds.). http:\/\/arxiv.org\/abs\/1412.6980"},{"key":"e_1_3_2_1_28_1","volume-title":"Advances in Neural Information Processing Systems 24: 25th Annual Conference on Neural Information Processing Systems","author":"Kr\u00e4henb\u00fchl Philipp","year":"2011","unstructured":"Philipp Kr\u00e4henb\u00fchl and Vladlen Koltun . 2011. Efficient Inference in Fully Connected CRFs with Gaussian Edge Potentials . In Advances in Neural Information Processing Systems 24: 25th Annual Conference on Neural Information Processing Systems 2011 . Proceedings of a meeting held 12-14 December 2011, Granada, Spain, John Shawe-Taylor, Richard\u00a0S. Zemel, Peter\u00a0L. Bartlett, Fernando C.\u00a0N. Pereira, and Kilian\u00a0Q. Weinberger (Eds .). 109\u2013117. https:\/\/proceedings.neurips.cc\/paper\/2011\/hash\/beda24c1e1b46055dff2c39c98fd6fc1-Abstract.html Philipp Kr\u00e4henb\u00fchl and Vladlen Koltun. 2011. Efficient Inference in Fully Connected CRFs with Gaussian Edge Potentials. In Advances in Neural Information Processing Systems 24: 25th Annual Conference on Neural Information Processing Systems 2011. Proceedings of a meeting held 12-14 December 2011, Granada, Spain, John Shawe-Taylor, Richard\u00a0S. Zemel, Peter\u00a0L. Bartlett, Fernando C.\u00a0N. Pereira, and Kilian\u00a0Q. Weinberger (Eds.). 109\u2013117. https:\/\/proceedings.neurips.cc\/paper\/2011\/hash\/beda24c1e1b46055dff2c39c98fd6fc1-Abstract.html"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00602"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01089"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3074008"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_33_1","volume-title":"Recurrent Multimodal Interaction for Referring Image Segmentation. In IEEE International Conference on Computer Vision, ICCV 2017","author":"Liu Chenxi","year":"2017","unstructured":"Chenxi Liu , Zhe Lin , Xiaohui Shen , Jimei Yang , Xin Lu , and Alan\u00a0 L. Yuille . 2017 . Recurrent Multimodal Interaction for Referring Image Segmentation. In IEEE International Conference on Computer Vision, ICCV 2017 , Venice, Italy , October 22-29, 2017. IEEE Computer Society, 1280\u20131289. https:\/\/doi.org\/10.1109\/ICCV.2017.143 10.1109\/ICCV.2017.143 Chenxi Liu, Zhe Lin, Xiaohui Shen, Jimei Yang, Xin Lu, and Alan\u00a0L. Yuille. 2017. Recurrent Multimodal Interaction for Referring Image Segmentation. In IEEE International Conference on Computer Vision, ICCV 2017, Venice, Italy, October 22-29, 2017. IEEE Computer Society, 1280\u20131289. https:\/\/doi.org\/10.1109\/ICCV.2017.143"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00477"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"e_1_3_2_1_37_1","volume-title":"The 28th ACM International Conference on Multimedia, Virtual Event \/ Seattle, WA, USA","author":"Luo Gen","year":"2020","unstructured":"Gen Luo , Yiyi Zhou , Rongrong Ji , Xiaoshuai Sun , Jinsong Su , Chia-Wen Lin , and Qi Tian . 2020 . Cascade Grouped Attention Network for Referring Expression Segmentation. In MM \u201920 : The 28th ACM International Conference on Multimedia, Virtual Event \/ Seattle, WA, USA , October 12-16, 2020, Chang\u00a0Wen Chen, Rita Cucchiara, Xian-Sheng Hua, Guo-Jun Qi, Elisa Ricci, Zhengyou Zhang, and Roger Zimmermann (Eds.). ACM, 1274\u20131282. https:\/\/doi.org\/10.1145\/3394171.3414006 10.1145\/3394171.3414006 Gen Luo, Yiyi Zhou, Rongrong Ji, Xiaoshuai Sun, Jinsong Su, Chia-Wen Lin, and Qi Tian. 2020. Cascade Grouped Attention Network for Referring Expression Segmentation. In MM \u201920: The 28th ACM International Conference on Multimedia, Virtual Event \/ Seattle, WA, USA, October 12-16, 2020, Chang\u00a0Wen Chen, Rita Cucchiara, Xian-Sheng Hua, Guo-Jun Qi, Elisa Ricci, Zhengyou Zhang, and Roger Zimmermann (Eds.). ACM, 1274\u20131282. https:\/\/doi.org\/10.1145\/3394171.3414006"},{"key":"e_1_3_2_1_38_1","volume-title":"Comprehension-Guided Referring Expressions. In 2017 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017","author":"Luo Ruotian","year":"2017","unstructured":"Ruotian Luo and Gregory Shakhnarovich . 2017 . Comprehension-Guided Referring Expressions. In 2017 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017 , Honolulu, HI, USA , July 21-26, 2017. IEEE Computer Society, 3125\u20133134. https:\/\/doi.org\/10.1109\/CVPR.2017.333 10.1109\/CVPR.2017.333 Ruotian Luo and Gregory Shakhnarovich. 2017. Comprehension-Guided Referring Expressions. In 2017 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017, Honolulu, HI, USA, July 21-26, 2017. IEEE Computer Society, 3125\u20133134. https:\/\/doi.org\/10.1109\/CVPR.2017.333"},{"key":"e_1_3_2_1_39_1","volume-title":"Generation and Comprehension of Unambiguous Object Descriptions. In 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016","author":"Mao Junhua","year":"2016","unstructured":"Junhua Mao , Jonathan Huang , Alexander Toshev , Oana Camburu , Alan\u00a0 L. Yuille , and Kevin Murphy . 2016 . Generation and Comprehension of Unambiguous Object Descriptions. In 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016 , Las Vegas, NV, USA , June 27-30, 2016. IEEE Computer Society, 11\u201320. https:\/\/doi.org\/10.1109\/CVPR.2016.9 10.1109\/CVPR.2016.9 Junhua Mao, Jonathan Huang, Alexander Toshev, Oana Camburu, Alan\u00a0L. Yuille, and Kevin Murphy. 2016. Generation and Comprehension of Unambiguous Object Descriptions. In 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016, Las Vegas, NV, USA, June 27-30, 2016. IEEE Computer Society, 11\u201320. https:\/\/doi.org\/10.1109\/CVPR.2016.9"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01252-6_39"},{"key":"e_1_3_2_1_41_1","volume-title":"Glove: Global Vectors for Word Representation. In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing, EMNLP","author":"Pennington Jeffrey","year":"2014","unstructured":"Jeffrey Pennington , Richard Socher , and Christopher\u00a0 D. Manning . 2014 . Glove: Global Vectors for Word Representation. In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing, EMNLP 2014, October 25-29, 2014, Doha, Qatar , A meeting of SIGDAT, a Special Interest Group of the ACL, Alessandro Moschitti, Bo\u00a0Pang, and Walter Daelemans (Eds.). ACL, 1532\u20131543. https:\/\/doi.org\/10.3115\/v1\/d14-1162 10.3115\/v1 Jeffrey Pennington, Richard Socher, and Christopher\u00a0D. Manning. 2014. Glove: Global Vectors for Word Representation. In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing, EMNLP 2014, October 25-29, 2014, Doha, Qatar, A meeting of SIGDAT, a Special Interest Group of the ACL, Alessandro Moschitti, Bo\u00a0Pang, and Walter Daelemans (Eds.). ACL, 1532\u20131543. https:\/\/doi.org\/10.3115\/v1\/d14-1162"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_1_43_1","volume-title":"Zero-Shot Grounding of Objects From Natural Language Queries. In 2019 IEEE\/CVF International Conference on Computer Vision, ICCV 2019","author":"Sadhu Arka","year":"2019","unstructured":"Arka Sadhu , Kan Chen , and Ram Nevatia . 2019 . Zero-Shot Grounding of Objects From Natural Language Queries. In 2019 IEEE\/CVF International Conference on Computer Vision, ICCV 2019 , Seoul, Korea (South), October 27 - November 2, 2019. IEEE, 4693\u20134702. https:\/\/doi.org\/10.1109\/ICCV.2019.00479 10.1109\/ICCV.2019.00479 Arka Sadhu, Kan Chen, and Ram Nevatia. 2019. Zero-Shot Grounding of Objects From Natural Language Queries. In 2019 IEEE\/CVF International Conference on Computer Vision, ICCV 2019, Seoul, Korea (South), October 27 - November 2, 2019. IEEE, 4693\u20134702. https:\/\/doi.org\/10.1109\/ICCV.2019.00479"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/78.650093"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_3"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2018.XIV.028"},{"key":"e_1_3_2_1_47_1","volume-title":"Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani , Noam Shazeer , Niki Parmar , Jakob Uszkoreit , Llion Jones , Aidan\u00a0 N. Gomez , Lukasz Kaiser , and Illia Polosukhin . 2017 . Attention is All you Need . In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017 , December 4-9, 2017, Long Beach, CA, USA, Isabelle Guyon, Ulrike von Luxburg, Samy Bengio, Hanna\u00a0M. Wallach, Rob Fergus, S.\u00a0V.\u00a0N. Vishwanathan, and Roman Garnett (Eds.). 5998\u20136008. https:\/\/proceedings.neurips.cc\/paper\/ 2017\/hash\/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017, Long Beach, CA, USA, Isabelle Guyon, Ulrike von Luxburg, Samy Bengio, Hanna\u00a0M. Wallach, Rob Fergus, S.\u00a0V.\u00a0N. Vishwanathan, and Roman Garnett (Eds.). 5998\u20136008. https:\/\/proceedings.neurips.cc\/paper\/2017\/hash\/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html"},{"key":"e_1_3_2_1_48_1","volume-title":"Bottom-Up Shift and Reasoning for Referring Image Segmentation. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021","author":"Yang Sibei","year":"2021","unstructured":"Sibei Yang , Meng Xia , Guanbin Li , Hong-Yu Zhou , and Yizhou Yu . 2021 . Bottom-Up Shift and Reasoning for Referring Image Segmentation. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021 , virtual, June 19-25, 2021. Computer Vision Foundation \/ IEEE, 11266\u201311275. https:\/\/openaccess.thecvf.com\/content\/CVPR 2021\/html\/Yang_Bottom-Up_Shift_and_Reasoning_for_Referring_Image_Segmentation_CVPR_2021_paper.html Sibei Yang, Meng Xia, Guanbin Li, Hong-Yu Zhou, and Yizhou Yu. 2021. Bottom-Up Shift and Reasoning for Referring Image Segmentation. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021, virtual, June 19-25, 2021. Computer Vision Foundation \/ IEEE, 11266\u201311275. https:\/\/openaccess.thecvf.com\/content\/CVPR2021\/html\/Yang_Bottom-Up_Shift_and_Reasoning_for_Referring_Image_Segmentation_CVPR_2021_paper.html"},{"key":"e_1_3_2_1_49_1","volume-title":"Cross-Modal Self-Attention Network for Referring Image Segmentation. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2019","author":"Ye Linwei","year":"2019","unstructured":"Linwei Ye , Mrigank Rochan , Zhi Liu , and Yang Wang . 2019 . Cross-Modal Self-Attention Network for Referring Image Segmentation. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2019 , Long Beach, CA, USA , June 16-20, 2019. Computer Vision Foundation \/ IEEE, 10502\u201310511. https:\/\/doi.org\/10.1109\/CVPR.2019.01075 10.1109\/CVPR.2019.01075 Linwei Ye, Mrigank Rochan, Zhi Liu, and Yang Wang. 2019. Cross-Modal Self-Attention Network for Referring Image Segmentation. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2019, Long Beach, CA, USA, June 16-20, 2019. Computer Vision Foundation \/ IEEE, 10502\u201310511. https:\/\/doi.org\/10.1109\/CVPR.2019.01075"},{"key":"e_1_3_2_1_50_1","volume-title":"MAttNet: Modular Attention Network for Referring Expression Comprehension. In 2018 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2018","author":"Yu Licheng","year":"2018","unstructured":"Licheng Yu , Zhe Lin , Xiaohui Shen , Jimei Yang , Xin Lu , Mohit Bansal , and Tamara\u00a0 L. Berg . 2018 . MAttNet: Modular Attention Network for Referring Expression Comprehension. In 2018 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2018 , Salt Lake City, UT, USA , June 18-22, 2018. Computer Vision Foundation \/ IEEE Computer Society, 1307\u20131315. https:\/\/doi.org\/10.1109\/CVPR.2018.00142 10.1109\/CVPR.2018.00142 Licheng Yu, Zhe Lin, Xiaohui Shen, Jimei Yang, Xin Lu, Mohit Bansal, and Tamara\u00a0L. Berg. 2018. MAttNet: Modular Attention Network for Referring Expression Comprehension. In 2018 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2018, Salt Lake City, UT, USA, June 18-22, 2018. Computer Vision Foundation \/ IEEE Computer Society, 1307\u20131315. https:\/\/doi.org\/10.1109\/CVPR.2018.00142"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"e_1_3_2_1_52_1","volume-title":"Pyramid Scene Parsing Network. In 2017 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017","author":"Zhao Hengshuang","year":"2017","unstructured":"Hengshuang Zhao , Jianping Shi , Xiaojuan Qi , Xiaogang Wang , and Jiaya Jia . 2017 . Pyramid Scene Parsing Network. In 2017 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017 , Honolulu, HI, USA , July 21-26, 2017. IEEE Computer Society, 6230\u20136239. https:\/\/doi.org\/10.1109\/CVPR.2017.660 10.1109\/CVPR.2017.660 Hengshuang Zhao, Jianping Shi, Xiaojuan Qi, Xiaogang Wang, and Jiaya Jia. 2017. Pyramid Scene Parsing Network. In 2017 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017, Honolulu, HI, USA, July 21-26, 2017. IEEE Computer Society, 6230\u20136239. https:\/\/doi.org\/10.1109\/CVPR.2017.660"}],"event":{"name":"ICMI '22: INTERNATIONAL CONFERENCE ON MULTIMODAL INTERACTION","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"],"location":"Bengaluru India","acronym":"ICMI '22"},"container-title":["Proceedings of the 2022 International Conference on Multimodal Interaction"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3536221.3556570","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3536221.3556570","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T17:48:52Z","timestamp":1750182532000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3536221.3556570"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,11,7]]},"references-count":52,"alternative-id":["10.1145\/3536221.3556570","10.1145\/3536221"],"URL":"https:\/\/doi.org\/10.1145\/3536221.3556570","relation":{},"subject":[],"published":{"date-parts":[[2022,11,7]]},"assertion":[{"value":"2022-11-07","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}