{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,15]],"date-time":"2026-01-15T05:29:02Z","timestamp":1768454942363,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":83,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T00:00:00Z","timestamp":1665360000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the National Natural Science Foundation of China","award":["62122010, 61876177"],"award-info":[{"award-number":["62122010, 61876177"]}]},{"name":"the Key Research and Development Program of Zhejiang Province","award":["2022C01082"],"award-info":[{"award-number":["2022C01082"]}]},{"name":"the Fundamental Research Funds for the Central Universities"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,10,10]]},"DOI":"10.1145\/3503161.3548086","type":"proceedings-article","created":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T15:42:46Z","timestamp":1665416566000},"page":"5537-5546","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["PPMN: Pixel-Phrase Matching Network for One-Stage Panoptic Narrative Grounding"],"prefix":"10.1145","author":[{"given":"Zihan","family":"Ding","sequence":"first","affiliation":[{"name":"Beihang University, Beijing, China"}]},{"given":"Zi-han","family":"Ding","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}]},{"given":"Tianrui","family":"Hui","sequence":"additional","affiliation":[{"name":"Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China"}]},{"given":"Junshi","family":"Huang","sequence":"additional","affiliation":[{"name":"Meituan, Beijing, China"}]},{"given":"Xiaoming","family":"Wei","sequence":"additional","affiliation":[{"name":"Meituan, Beijing, China"}]},{"given":"Xiaolin","family":"Wei","sequence":"additional","affiliation":[{"name":"Meituan, Beijing, China"}]},{"given":"Si","family":"Liu","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2022,10,10]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01276"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_2_3_1","volume-title":"Jamie Ryan Kiros, and Geoffrey E Hinton","author":"Ba Jimmy Lei","year":"2016","unstructured":"Jimmy Lei Ba , Jamie Ryan Kiros, and Geoffrey E Hinton . 2016 . Layer normalization. arXiv preprint arXiv:1607.06450 (2016). Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. 2016. Layer normalization. arXiv preprint arXiv:1607.06450 (2016)."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00438"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.285"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01249"},{"key":"e_1_3_2_2_9_1","volume-title":"Per-pixel classification is not all you need for semantic segmentation. Advances in Neural Information Processing Systems 34","author":"Cheng Bowen","year":"2021","unstructured":"Bowen Cheng , Alex Schwing , and Alexander Kirillov . 2021. Per-pixel classification is not all you need for semantic segmentation. Advances in Neural Information Processing Systems 34 ( 2021 ). Bowen Cheng, Alex Schwing, and Alexander Kirillov. 2021. Per-pixel classification is not all you need for semantic segmentation. Advances in Neural Information Processing Systems 34 (2021)."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475677"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"e_1_3_2_2_12_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin , Ming-Wei Chang , Kenton Lee , and Kristina Toutanova . 2018 . Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018). Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_2_13_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision. 16321-- 16330","author":"Ding Henghui","year":"2021","unstructured":"Henghui Ding , Chang Liu , Suchen Wang , and Xudong Jiang . 2021 . Visionlanguage transformer and query generation for referring segmentation . In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 16321-- 16330 . Henghui Ding, Chang Liu, Suchen Wang, and Xudong Jiang. 2021. Visionlanguage transformer and query generation for referring segmentation. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 16321-- 16330."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00491"},{"key":"e_1_3_2_2_15_1","volume-title":"Progressive multimodal interaction network for referring video object segmentation. The 3rd Large-scale Video Object Segmentation Challenge","author":"Ding Zihan","year":"2021","unstructured":"Zihan Ding , Tianrui Hui , Shaofei Huang , Si Liu , Xuan Luo , Junshi Huang , and Xiaoming Wei . 2021. Progressive multimodal interaction network for referring video object segmentation. The 3rd Large-scale Video Object Segmentation Challenge ( 2021 ), 7. Zihan Ding, Tianrui Hui, Shaofei Huang, Si Liu, Xuan Luo, Junshi Huang, and Xiaoming Wei. 2021. Progressive multimodal interaction network for referring video object segmentation. The 3rd Large-scale Video Object Segmentation Challenge (2021), 7."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00430"},{"key":"e_1_3_2_2_17_1","volume-title":"Encoder Fusion Network with Co-Attention Embedding for Referring Image Segmentation. In 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 15501--15510","author":"Feng Guang","year":"2021","unstructured":"Guang Feng , Zhiwei Hu , Lihe Zhang , and Huchuan Lu . 2021 . Encoder Fusion Network with Co-Attention Embedding for Referring Image Segmentation. In 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 15501--15510 . Guang Feng, Zhiwei Hu, Lihe Zhang, and Huchuan Lu. 2021. Encoder Fusion Network with Co-Attention Embedding for Referring Image Segmentation. In 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 15501--15510."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00140"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_21_1","volume-title":"Long short-term memory. Neural computation 9, 8","author":"Hochreiter Sepp","year":"1997","unstructured":"Sepp Hochreiter and J\u00fcrgen Schmidhuber . 1997. Long short-term memory. Neural computation 9, 8 ( 1997 ), 1735--1780. Sepp Hochreiter and J\u00fcrgen Schmidhuber. 1997. Long short-term memory. Neural computation 9, 8 (1997), 1735--1780."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_7"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01050"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00417"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58607-2_4"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475222"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413902"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"e_1_3_2_2_29_1","volume-title":"Deep fragment embeddings for bidirectional image sentence mapping. Advances in neural information processing systems 27","author":"Karpathy Andrej","year":"2014","unstructured":"Andrej Karpathy , Armand Joulin , and Li F Fei-Fei . 2014. Deep fragment embeddings for bidirectional image sentence mapping. Advances in neural information processing systems 27 ( 2014 ). Andrej Karpathy, Armand Joulin, and Li F Fei-Fei. 2014. Deep fragment embeddings for bidirectional image sentence mapping. Advances in neural information processing systems 27 (2014)."},{"key":"e_1_3_2_2_30_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba . 2014 . Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014). Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00656"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00963"},{"key":"e_1_3_2_2_33_1","volume-title":"Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems 25","author":"Krizhevsky Alex","year":"2012","unstructured":"Alex Krizhevsky , Ilya Sutskever , and Geoffrey E Hinton . 2012. Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems 25 ( 2012 ). Alex Krizhevsky, Ilya Sutskever, and Geoffrey E Hinton. 2012. Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems 25 (2012)."},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475629"},{"key":"e_1_3_2_2_35_1","unstructured":"Liunian Harold Li Pengchuan Zhang Haotian Zhang Jianwei Yang Chunyuan Li Yiwu Zhong LijuanWang Lu Yuan Lei Zhang Jenq-Neng Hwang etal 2021. Grounded Language-Image Pre-training. arXiv preprint arXiv:2112.03857 (2021).  Liunian Harold Li Pengchuan Zhang Haotian Zhang Jianwei Yang Chunyuan Li Yiwu Zhong LijuanWang Lu Yuan Lei Zhang Jenq-Neng Hwang et al. 2021. Grounded Language-Image Pre-training. arXiv preprint arXiv:2112.03857 (2021)."},{"key":"e_1_3_2_2_36_1","volume-title":"Referring transformer: A one-step approach to multi-task visual grounding. Advances in Neural Information Processing Systems 34","author":"Li Muchen","year":"2021","unstructured":"Muchen Li and Leonid Sigal . 2021. Referring transformer: A one-step approach to multi-task visual grounding. Advances in Neural Information Processing Systems 34 ( 2021 ). Muchen Li and Leonid Sigal. 2021. Referring transformer: A one-step approach to multi-task visual grounding. Advances in Neural Information Processing Systems 34 (2021)."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01333"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00060"},{"key":"e_1_3_2_2_39_1","volume-title":"arXiv preprint arXiv:2109.03814","author":"Li Zhiqi","year":"2021","unstructured":"Zhiqi Li , Wenhai Wang , Enze Xie , Zhiding Yu , Anima Anandkumar , Jose M Alvarez , Tong Lu , and Ping Luo . 2021. Panoptic SegFormer . arXiv preprint arXiv:2109.03814 ( 2021 ). Zhiqi Li, Wenhai Wang, Enze Xie, Zhiding Yu, Anima Anandkumar, Jose M Alvarez, Tong Lu, and Ping Luo. 2021. Panoptic SegFormer. arXiv preprint arXiv:2109.03814 (2021)."},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01089"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3181516"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.106"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3079993"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00205"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6833"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"e_1_3_2_2_48_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu , Dhruv Batra , Devi Parikh , and Stefan Lee . 2019 . Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32 (2019). Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3414006"},{"key":"e_1_3_2_2_50_1","volume-title":"Multi-Task Collaborative Network for Joint Referring Expression Comprehension and Segmentation. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 10031--10040","author":"Luo Gen","year":"2020","unstructured":"Gen Luo , Yiyi Zhou , Xiaoshuai Sun , Liujuan Cao , Chenglin Wu , Cheng Deng , and Rongrong Ji . 2020 . Multi-Task Collaborative Network for Joint Referring Expression Comprehension and Segmentation. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 10031--10040 . Gen Luo, Yiyi Zhou, Xiaoshuai Sun, Liujuan Cao, Chenglin Wu, Cheng Deng, and Rongrong Ji. 2020. Multi-Task Collaborative Network for Joint Referring Expression Comprehension and Segmentation. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 10031--10040."},{"key":"e_1_3_2_2_51_1","volume-title":"V-net: Fully convolutional neural networks for volumetric medical image segmentation. In 2016 fourth international conference on 3D vision (3DV)","author":"Milletari Fausto","year":"2016","unstructured":"Fausto Milletari , Nassir Navab , and Seyed-Ahmad Ahmadi . 2016 . V-net: Fully convolutional neural networks for volumetric medical image segmentation. In 2016 fourth international conference on 3D vision (3DV) . IEEE , 565--571. Fausto Milletari, Nassir Navab, and Seyed-Ahmad Ahmadi. 2016. V-net: Fully convolutional neural networks for volumetric medical image segmentation. In 2016 fourth international conference on 3D vision (3DV). IEEE, 565--571."},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i15.17602"},{"key":"e_1_3_2_2_53_1","unstructured":"Vinod Nair and Geoffrey E Hinton. 2010. Rectified linear units improve restricted boltzmann machines. In Icml.  Vinod Nair and Geoffrey E Hinton. 2010. Rectified linear units improve restricted boltzmann machines. In Icml."},{"key":"e_1_3_2_2_54_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke , Sam Gross , Francisco Massa , Adam Lerer , James Bradbury , Gregory Chanan , Trevor Killeen , Zeming Lin , Natalia Gimelshein , Luca Antiga , 2019 . Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019). Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01258-8_16"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_38"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.91"},{"key":"e_1_3_2_2_58_1","volume-title":"VL-BERT: Pre-training of Generic Visual-Linguistic Representations. In International Conference on Learning Representations.","author":"Su Weijie","year":"2019","unstructured":"Weijie Su , Xizhou Zhu , Yue Cao , Bin Li , Lewei Lu , Furu Wei , and Jifeng Dai . 2019 . VL-BERT: Pre-training of Generic Visual-Linguistic Representations. In International Conference on Learning Representations. Weijie Su, Xizhou Zhu, Yue Cao, Bin Li, Lewei Lu, Furu Wei, and Jifeng Dai. 2019. VL-BERT: Pre-training of Generic Visual-Linguistic Representations. In International Conference on Learning Representations."},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_17"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3145407"},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.1007\/BF02289464"},{"key":"e_1_3_2_2_62_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani , Noam Shazeer , Niki Parmar , Jakob Uszkoreit , Llion Jones , Aidan N Gomez , Lukasz Kaiser , and Illia Polosukhin . 2017. Attention is all you need. Advances in neural information processing systems 30 ( 2017 ). Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_2_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00542"},{"key":"e_1_3_2_2_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2797921"},{"key":"e_1_3_2_2_65_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 1960--1968","author":"Wang Peng","unstructured":"Peng Wang , Qi Wu , Jiewei Cao , Chunhua Shen , Lianli Gao , and Anton van den Hengel. 2019. Neighbourhood watch: Referring expression comprehension via language-guided graph attention networks . In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 1960--1968 . Peng Wang, Qi Wu, Jiewei Cao, Chunhua Shen, Lianli Gao, and Anton van den Hengel. 2019. Neighbourhood watch: Referring expression comprehension via language-guided graph attention networks. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 1960--1968."},{"key":"e_1_3_2_2_66_1","volume-title":"Solov2: Dynamic and fast instance segmentation. Advances in Neural information processing systems 33","author":"Zhang Rufeng","year":"2020","unstructured":"XinlongWang, Rufeng Zhang , Tao Kong , Lei Li , and Chunhua Shen . 2020. Solov2: Dynamic and fast instance segmentation. Advances in Neural information processing systems 33 ( 2020 ), 17721--17732. XinlongWang, Rufeng Zhang, Tao Kong, Lei Li, and Chunhua Shen. 2020. Solov2: Dynamic and fast instance segmentation. Advances in Neural information processing systems 33 (2020), 17721--17732."},{"key":"e_1_3_2_2_67_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_1"},{"key":"e_1_3_2_2_68_1","unstructured":"Yuxin Wu Alexander Kirillov Francisco Massa Wan-Yen Lo and Ross Girshick. 2019. Detectron2. https:\/\/github.com\/facebookresearch\/detectron2  Yuxin Wu Alexander Kirillov Francisco Massa Wan-Yen Lo and Ross Girshick. 2019. Detectron2. https:\/\/github.com\/facebookresearch\/detectron2"},{"key":"e_1_3_2_2_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00902"},{"key":"e_1_3_2_2_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00474"},{"key":"e_1_3_2_2_71_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58529-7_35"},{"key":"e_1_3_2_2_72_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58568-6_23"},{"key":"e_1_3_2_2_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00478"},{"key":"e_1_3_2_2_74_1","volume-title":"LAVT: Language-Aware Vision Transformer for Referring Image Segmentation. arXiv preprint arXiv:2112.02244","author":"Yang Zhao","year":"2021","unstructured":"Zhao Yang , Jiaqi Wang , Yansong Tang , Kai Chen , Hengshuang Zhao , and Philip HS Torr . 2021 . LAVT: Language-Aware Vision Transformer for Referring Image Segmentation. arXiv preprint arXiv:2112.02244 (2021). Zhao Yang, Jiaqi Wang, Yansong Tang, Kai Chen, Hengshuang Zhao, and Philip HS Torr. 2021. LAVT: Language-Aware Vision Transformer for Referring Image Segmentation. arXiv preprint arXiv:2112.02244 (2021)."},{"key":"e_1_3_2_2_75_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475313"},{"key":"e_1_3_2_2_76_1","volume-title":"MAttNet: Modular Attention Network for Referring Expression Comprehension. In 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition. IEEE, 1307--1315","author":"Yu Licheng","year":"2018","unstructured":"Licheng Yu , Zhe Lin , Xiaohui Shen , Jimei Yang , Xin Lu , Mohit Bansal , and Tamara L Berg . 2018 . MAttNet: Modular Attention Network for Referring Expression Comprehension. In 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition. IEEE, 1307--1315 . Licheng Yu, Zhe Lin, Xiaohui Shen, Jimei Yang, Xin Lu, Mohit Bansal, and Tamara L Berg. 2018. MAttNet: Modular Attention Network for Referring Expression Comprehension. In 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition. IEEE, 1307--1315."},{"key":"e_1_3_2_2_77_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413846"},{"key":"e_1_3_2_2_78_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00437"},{"key":"e_1_3_2_2_79_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475343"},{"key":"e_1_3_2_2_80_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3039522"},{"key":"e_1_3_2_2_81_1","volume-title":"K-net: Towards unified image segmentation. Advances in Neural Information Processing Systems 34","author":"Zhang Wenwei","year":"2021","unstructured":"Wenwei Zhang , Jiangmiao Pang , Kai Chen , and Chen Change Loy . 2021 . K-net: Towards unified image segmentation. Advances in Neural Information Processing Systems 34 (2021). Wenwei Zhang, Jiangmiao Pang, Kai Chen, and Chen Change Loy. 2021. K-net: Towards unified image segmentation. Advances in Neural Information Processing Systems 34 (2021)."},{"key":"e_1_3_2_2_82_1","volume-title":"Cross-modality relevance for reasoning on language and vision. arXiv preprint arXiv:2005.06035","author":"Zheng Chen","year":"2020","unstructured":"Chen Zheng , Quan Guo , and Parisa Kordjamshidi . 2020. Cross-modality relevance for reasoning on language and vision. arXiv preprint arXiv:2005.06035 ( 2020 ). Chen Zheng, Quan Guo, and Parisa Kordjamshidi. 2020. Cross-modality relevance for reasoning on language and vision. arXiv preprint arXiv:2005.06035 (2020)."},{"key":"e_1_3_2_2_83_1","volume-title":"Objects as points. arXiv preprint arXiv:1904.07850","author":"Zhou Xingyi","year":"2019","unstructured":"Xingyi Zhou , Dequan Wang , and Philipp Kr\u00e4henb\u00fchl . 2019. Objects as points. arXiv preprint arXiv:1904.07850 ( 2019 ). Xingyi Zhou, Dequan Wang, and Philipp Kr\u00e4henb\u00fchl. 2019. Objects as points. arXiv preprint arXiv:1904.07850 (2019)."}],"event":{"name":"MM '22: The 30th ACM International Conference on Multimedia","location":"Lisboa Portugal","acronym":"MM '22","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 30th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3548086","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3503161.3548086","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:00:18Z","timestamp":1750186818000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3548086"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10,10]]},"references-count":83,"alternative-id":["10.1145\/3503161.3548086","10.1145\/3503161"],"URL":"https:\/\/doi.org\/10.1145\/3503161.3548086","relation":{},"subject":[],"published":{"date-parts":[[2022,10,10]]},"assertion":[{"value":"2022-10-10","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}