{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T15:57:54Z","timestamp":1780934274516,"version":"3.54.1"},"reference-count":46,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100003696","name":"Electronics and Telecommunications Research Institute","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003696","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003662","name":"Korea Planning & Evaluation Institute of Industrial Technology","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003662","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,12]]},"DOI":"10.1016\/j.patcog.2026.114006","type":"journal-article","created":{"date-parts":[[2026,5,23]],"date-time":"2026-05-23T06:46:47Z","timestamp":1779518807000},"page":"114006","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"PA","title":["Bidirectional token-masking autoencoder for Referring Image Segmentation"],"prefix":"10.1016","volume":"180","author":[{"given":"Minhyeok","family":"Lee","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dogyoon","family":"Lee","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jungho","family":"Lee","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Suhwan","family":"Cho","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0394-6777","authenticated-orcid":false,"given":"Sangyoun","family":"Lee","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2026.114006_b1","doi-asserted-by":"crossref","unstructured":"Y. Song, X. Shao, K. Chen, W. Zhang, Z. Jing, M. Li, CLIPVG: text-guided image manipulation using differentiable vector graphics, in: Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 37, 2023, pp. 2312\u20132320.","DOI":"10.1609\/aaai.v37i2.25326"},{"key":"10.1016\/j.patcog.2026.114006_b2","series-title":"European Conference on Computer Vision","first-page":"88","article-title":"Vqgan-clip: Open domain image generation and editing with natural language guidance","author":"Crowson","year":"2022"},{"key":"10.1016\/j.patcog.2026.114006_b3","doi-asserted-by":"crossref","unstructured":"X. Wang, Q. Huang, A. Celikyilmaz, J. Gao, D. Shen, Y.-F. Wang, W.Y. Wang, L. Zhang, Reinforced cross-modal matching and self-supervised imitation learning for vision-language navigation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 6629\u20136638.","DOI":"10.1109\/CVPR.2019.00679"},{"key":"10.1016\/j.patcog.2026.114006_b4","doi-asserted-by":"crossref","unstructured":"Z. Yang, J. Wang, Y. Tang, K. Chen, H. Zhao, P.H. Torr, Lavt: Language-aware vision transformer for referring image segmentation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 18155\u201318165.","DOI":"10.1109\/CVPR52688.2022.01762"},{"key":"10.1016\/j.patcog.2026.114006_b5","doi-asserted-by":"crossref","unstructured":"J. Tang, G. Zheng, C. Shi, S. Yang, Contrastive Grouping with Transformer for Referring Image Segmentation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 23570\u201323580.","DOI":"10.1109\/CVPR52729.2023.02257"},{"key":"10.1016\/j.patcog.2026.114006_b6","doi-asserted-by":"crossref","unstructured":"K. He, X. Chen, S. Xie, Y. Li, P. Doll\u00e1r, R. Girshick, Masked autoencoders are scalable vision learners, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 16000\u201316009.","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"10.1016\/j.patcog.2026.114006_b7","doi-asserted-by":"crossref","unstructured":"W. Wang, H. Bao, L. Dong, J. Bjorck, Z. Peng, Q. Liu, K. Aggarwal, O.K. Mohammed, S. Singhal, S. Som, et al., Image as a Foreign Language: BEiT Pretraining for Vision and Vision-Language Tasks, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 19175\u201319186.","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"10.1016\/j.patcog.2026.114006_b8","series-title":"2009 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"248","article-title":"Imagenet: A large-scale hierarchical image database","author":"Deng","year":"2009"},{"key":"10.1016\/j.patcog.2026.114006_b9","doi-asserted-by":"crossref","unstructured":"A. Kirillov, E. Mintun, N. Ravi, H. Mao, C. Rolland, L. Gustafson, T. Xiao, S. Whitehead, A.C. Berg, W.-Y. Lo, et al., Segment anything, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 4015\u20134026.","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"10.1016\/j.patcog.2026.114006_b10","doi-asserted-by":"crossref","unstructured":"J. Lee, S. Lee, J. Nam, S. Yu, J. Do, T. Taghavi, Weakly Supervised Referring Image Segmentation with Intra-Chunk and Inter-Chunk Consistency, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 21870\u201321881.","DOI":"10.1109\/ICCV51070.2023.01999"},{"key":"10.1016\/j.patcog.2026.114006_b11","doi-asserted-by":"crossref","unstructured":"F. Liu, Y. Liu, Y. Kong, K. Xu, L. Zhang, B. Yin, G. Hancke, R. Lau, Referring Image Segmentation Using Text Supervision, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 22124\u201322134.","DOI":"10.1109\/ICCV51070.2023.02022"},{"key":"10.1016\/j.patcog.2026.114006_b12","doi-asserted-by":"crossref","unstructured":"D. Kim, N. Kim, C. Lan, S. Kwak, Shatter and Gather: Learning Referring Image Segmentation with Text Supervision, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 15547\u201315557.","DOI":"10.1109\/ICCV51070.2023.01425"},{"key":"10.1016\/j.patcog.2026.114006_b13","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.111311","article-title":"Bimodal masked autoencoders with internal representation connections for electrocardiogram classification","volume":"161","author":"Wei","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114006_b14","series-title":"Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, the Netherlands, October 11-14, 2016, Proceedings, Part II 14","first-page":"69","article-title":"Modeling context in referring expressions","author":"Yu","year":"2016"},{"key":"10.1016\/j.patcog.2026.114006_b15","doi-asserted-by":"crossref","unstructured":"J. Mao, J. Huang, A. Toshev, O. Camburu, A.L. Yuille, K. Murphy, Generation and comprehension of unambiguous object descriptions, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 11\u201320.","DOI":"10.1109\/CVPR.2016.9"},{"key":"10.1016\/j.patcog.2026.114006_b16","series-title":"Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, the Netherlands, October 11\u201314, 2016, Proceedings, Part IV 14","first-page":"792","article-title":"Modeling context between objects for referring expression understanding","author":"Nagaraja","year":"2016"},{"key":"10.1016\/j.patcog.2026.114006_b17","article-title":"Generalized referring expression segmentation driven by instance-oriented queries","author":"Li","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114006_b18","article-title":"Linguistic query-guided mask generation for referring image segmentation","volume":"172","author":"Zhichao","year":"2026","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114006_b19","article-title":"LGD: Leveraging generative descriptions for zero-shot referring image segmentation","volume":"172","author":"Jiachen","year":"2026","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114006_b20","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110535","article-title":"Bidirectional correlation-driven inter-frame interaction transformer for referring video object segmentation","volume":"153","author":"Lan","year":"2024","journal-title":"Pattern Recognit."},{"issue":"10","key":"10.1016\/j.patcog.2026.114006_b21","doi-asserted-by":"crossref","first-page":"14727","DOI":"10.1109\/TNNLS.2023.3281372","article-title":"Referring image segmentation with Fine-Grained semantic funneling infusion","volume":"35","author":"Yang","year":"2024","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"issue":"12","key":"10.1016\/j.patcog.2026.114006_b22","doi-asserted-by":"crossref","first-page":"17754","DOI":"10.1109\/TNNLS.2023.3308550","article-title":"Global and local interactive perception network for referring image segmentation","volume":"35","author":"Liu","year":"2024","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"10.1016\/j.patcog.2026.114006_b23","doi-asserted-by":"crossref","unstructured":"X. Wang, K. Zhao, R. Zhang, S. Ding, Y. Wang, W. Shen, Contrastmask: Contrastive learning to segment every thing, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 11604\u201311613.","DOI":"10.1109\/CVPR52688.2022.01131"},{"key":"10.1016\/j.patcog.2026.114006_b24","doi-asserted-by":"crossref","unstructured":"R. Xie, K. Pang, G.D. Bader, B. Wang, MAESTER: Masked Autoencoder Guided Segmentation at Pixel Resolution for Accurate, Self-Supervised Subcellular Structure Recognition, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 3292\u20133301.","DOI":"10.1109\/CVPR52729.2023.00321"},{"key":"10.1016\/j.patcog.2026.114006_b25","doi-asserted-by":"crossref","unstructured":"Q. Wu, T. Yang, Z. Liu, B. Wu, Y. Shan, A.B. Chan, DropMAE: Masked Autoencoders with Spatial-Attention Dropout for Tracking Tasks, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 14561\u201314571.","DOI":"10.1109\/CVPR52729.2023.01399"},{"key":"10.1016\/j.patcog.2026.114006_b26","article-title":"Cross-Modality masked autoencoder for infrared and visible image fusion","author":"Bi","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114006_b27","series-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019"},{"key":"10.1016\/j.patcog.2026.114006_b28","series-title":"Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13","first-page":"740","article-title":"Microsoft coco: Common objects in context","author":"Lin","year":"2014"},{"key":"10.1016\/j.patcog.2026.114006_b29","doi-asserted-by":"crossref","unstructured":"J. Liu, H. Ding, Z. Cai, Y. Zhang, R.K. Satzoda, V. Mahadevan, R. Manmatha, PolyFormer: Referring image segmentation as sequential polygon generation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 18653\u201318663.","DOI":"10.1109\/CVPR52729.2023.01789"},{"key":"10.1016\/j.patcog.2026.114006_b30","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110719","article-title":"Token-word mixer meets object-aware transformer for referring image segmentation","volume":"155","author":"Zhang","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114006_b31","article-title":"Cmirnet: Cross-modal interactive reasoning network for referring image segmentation","author":"Xu","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.patcog.2026.114006_b32","doi-asserted-by":"crossref","unstructured":"C. Shang, Z. Song, H. Qiu, L. Wang, F. Meng, H. Li, Prompt-driven referring image segmentation with instance contrasting, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 4124\u20134134.","DOI":"10.1109\/CVPR52733.2024.00395"},{"key":"10.1016\/j.patcog.2026.114006_b33","series-title":"European Conference on Computer Vision","first-page":"125","article-title":"An efficient and effective transformer decoder-based framework for multi-task visual grounding","author":"Chen","year":"2024"},{"key":"10.1016\/j.patcog.2026.114006_b34","doi-asserted-by":"crossref","first-page":"121670","DOI":"10.52202\/079017-3867","article-title":"Simvg: A simple framework for visual grounding with decoupled multi-modal fusion","volume":"37","author":"Dai","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.114006_b35","first-page":"139854","article-title":"Oneref: Unified one-tower expression grounding and segmentation with mask referring modeling","volume":"37","author":"Xiao","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.114006_b36","doi-asserted-by":"crossref","unstructured":"M. Dai, J. Li, J. Zhuang, X. Zhang, W. Yang, Multi-task visual grounding with coarse-to-fine consistency constraints, in: Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 39, 2025, pp. 2618\u20132626.","DOI":"10.1609\/aaai.v39i3.32265"},{"key":"10.1016\/j.patcog.2026.114006_b37","doi-asserted-by":"crossref","unstructured":"S. Yu, J. Hong, J. Lee, J. Son, Latent Expression Generation for Referring Image Segmentation and Grounding, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2025, pp. 21374\u201321383.","DOI":"10.1109\/ICCV51701.2025.01985"},{"key":"10.1016\/j.patcog.2026.114006_b38","doi-asserted-by":"crossref","unstructured":"S. Ouyang, H. Wang, S. Xie, Z. Niu, R. Tong, Y.-W. Chen, L. Lin, Slvit: Scale-wise language-guided vision transformer for referring image segmentation, in: Proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence, IJCAI-23, pp. 1294\u20131302.","DOI":"10.24963\/ijcai.2023\/144"},{"key":"10.1016\/j.patcog.2026.114006_b39","article-title":"Multi-modal mutual attention and iterative interaction for referring image segmentation","author":"Liu","year":"2023","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.patcog.2026.114006_b40","doi-asserted-by":"crossref","unstructured":"Z. Yang, J. Wang, Y. Tang, K. Chen, H. Zhao, P.H. Torr, Semantics-aware dynamic localization and refinement for referring image segmentation, in: Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 37, 2023, pp. 3222\u20133230.","DOI":"10.1609\/aaai.v37i3.25428"},{"key":"10.1016\/j.patcog.2026.114006_b41","doi-asserted-by":"crossref","unstructured":"L. Xu, M.H. Huang, X. Shang, Z. Yuan, Y. Sun, J. Liu, Meta compositional referring expression segmentation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 19478\u201319487.","DOI":"10.1109\/CVPR52729.2023.01866"},{"key":"10.1016\/j.patcog.2026.114006_b42","doi-asserted-by":"crossref","unstructured":"Z. Xu, Z. Chen, Y. Zhang, Y. Song, X. Wan, G. Li, Bridging vision and language encoders: Parameter-efficient tuning for referring image segmentation, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 17503\u201317512.","DOI":"10.1109\/ICCV51070.2023.01605"},{"key":"10.1016\/j.patcog.2026.114006_b43","doi-asserted-by":"crossref","unstructured":"Y. Hu, Q. Wang, W. Shao, E. Xie, Z. Li, J. Han, P. Luo, Beyond One-to-One: Rethinking the Referring Image Segmentation, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 4067\u20134077.","DOI":"10.1109\/ICCV51070.2023.00376"},{"key":"10.1016\/j.patcog.2026.114006_b44","doi-asserted-by":"crossref","first-page":"1782","DOI":"10.1109\/TIP.2024.3371348","article-title":"Toward robust referring image segmentation","volume":"33","author":"Wu","year":"2024","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.patcog.2026.114006_b45","doi-asserted-by":"crossref","unstructured":"Z. Liu, Y. Lin, Y. Cao, H. Hu, Y. Wei, Z. Zhang, S. Lin, B. Guo, Swin transformer: Hierarchical vision transformer using shifted windows, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 10012\u201310022.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"10.1016\/j.patcog.2026.114006_b46","doi-asserted-by":"crossref","unstructured":"T. Wolf, L. Debut, V. Sanh, J. Chaumond, C. Delangue, A. Moi, P. Cistac, T. Rault, R. Louf, M. Funtowicz, et al., Transformers: State-of-the-art natural language processing, in: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, 2020, pp. 38\u201345.","DOI":"10.18653\/v1\/2020.emnlp-demos.6"}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326009714?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326009714?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T15:01:43Z","timestamp":1780930903000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320326009714"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,12]]},"references-count":46,"alternative-id":["S0031320326009714"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2026.114006","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,12]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Bidirectional token-masking autoencoder for Referring Image Segmentation","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2026.114006","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"114006"}}