{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T12:05:42Z","timestamp":1780401942280,"version":"3.54.1"},"reference-count":34,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100015258","name":"Zhejiang Provincial People's Hospital","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100015258","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004543","name":"China Scholarship Council","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004543","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Displays"],"published-print":{"date-parts":[[2026,12]]},"DOI":"10.1016\/j.displa.2026.103556","type":"journal-article","created":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T15:56:25Z","timestamp":1780329385000},"page":"103556","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["CLEVR-Remv: A synthetic multimodal dataset and multi-view benchmark for Referring Expression Segmentation"],"prefix":"10.1016","volume":"95","author":[{"given":"Jianan","family":"Chen","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lu","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Qiong","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Cong","family":"Bai","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kidiyo","family":"Kpalma","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.displa.2026.103556_b1","series-title":"2016 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"11","article-title":"Generation and comprehension of unambiguous object descriptions","author":"Mao","year":"2016"},{"key":"10.1016\/j.displa.2026.103556_b2","series-title":"Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing","first-page":"787","article-title":"ReferItGame: Referring to objects in photographs of natural scenes","author":"Kazemzadeh","year":"2014"},{"key":"10.1016\/j.displa.2026.103556_b3","series-title":"Modeling context in referring expressions","author":"Yu","year":"2016"},{"key":"10.1016\/j.displa.2026.103556_b4","unstructured":"B.O. Community, Blender - a 3D modelling and rendering package. URL: http:\/\/www.blender.org."},{"issue":"12","key":"10.1016\/j.displa.2026.103556_b5","doi-asserted-by":"crossref","first-page":"8927","DOI":"10.1109\/TPAMI.2021.3126648","article-title":"Fine-grained image analysis with deep learning: A survey","volume":"44","author":"Wei","year":"2022","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.displa.2026.103556_b6","first-page":"1","article-title":"Referring image segmentation with fine-grained semantic funneling infusion","author":"Yang","year":"2023","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"10.1016\/j.displa.2026.103556_b7","series-title":"ECCV","article-title":"Microsoft COCO: Common objects in context","author":"Lin","year":"2014"},{"key":"10.1016\/j.displa.2026.103556_b8","series-title":"2017 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"1988","article-title":"CLEVR: A diagnostic dataset for compositional language and elementary visual reasoning","author":"Johnson","year":"2017"},{"key":"10.1016\/j.displa.2026.103556_b9","doi-asserted-by":"crossref","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","article-title":"Long short-term memory","volume":"9","author":"Hochreiter","year":"1997","journal-title":"Neural Comput."},{"key":"10.1016\/j.displa.2026.103556_b10","series-title":"2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"4180","article-title":"CLEVR-Ref+: Diagnosing visual reasoning with referring expressions","author":"Liu","year":"2019"},{"key":"10.1016\/j.displa.2026.103556_b11","series-title":"Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks","article-title":"ClevrTex: A texture-rich benchmark for unsupervised multi-object segmentation","author":"Karazija","year":"2021"},{"key":"10.1016\/j.displa.2026.103556_b12","series-title":"XxAI - beyond Explainable AI: International Workshop, Held in Conjunction with ICML 2020, July 18, 2020, Vienna, Austria, Revised and Extended Papers","first-page":"69","article-title":"CLEVR-X: A visual reasoning dataset for natural language explanations","author":"Salewski","year":"2022"},{"key":"10.1016\/j.displa.2026.103556_b13","series-title":"2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10083","article-title":"Cops-ref: A new dataset and task on compositional referring expression comprehension","author":"Chen","year":"2020"},{"key":"10.1016\/j.displa.2026.103556_b14","series-title":"2022 IEEE International Conference on Image Processing","first-page":"1166","article-title":"MVMO: A multi-object dataset for wide baseline multi-view semantic segmentation","author":"Alvarez-Gila","year":"2022"},{"key":"10.1016\/j.displa.2026.103556_b15","doi-asserted-by":"crossref","first-page":"7880","DOI":"10.1109\/TCSVT.2022.3187664","article-title":"UrbanLF: A comprehensive light field dataset for semantic segmentation of urban scenes","volume":"32","author":"Sheng","year":"2022","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.displa.2026.103556_b16","series-title":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1998","article-title":"Manet: Multi-scale aggregated network for light field depth estimation","author":"Li","year":"2020"},{"key":"10.1016\/j.displa.2026.103556_b17","doi-asserted-by":"crossref","unstructured":"C. Liu, Z. Lin, X. Shen, J. Yang, X. Lu, A. Yuille, Recurrent Multimodal Interaction for Referring Image Segmentation, in: Proceedings of the IEEE International Conference on Computer Vision, ICCV, 2017.","DOI":"10.1109\/ICCV.2017.143"},{"key":"10.1016\/j.displa.2026.103556_b18","doi-asserted-by":"crossref","unstructured":"Z. Hu, G. Feng, J. Sun, L. Zhang, H. Lu, Bi-Directional Relationship Inferring Network for Referring Image Segmentation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2020.","DOI":"10.1109\/CVPR42600.2020.00448"},{"key":"10.1016\/j.displa.2026.103556_b19","doi-asserted-by":"crossref","unstructured":"L. Ye, M. Rochan, Z. Liu, Y. Wang, Cross-Modal Self-Attention Network for Referring Image Segmentation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2019.","DOI":"10.1109\/CVPR.2019.01075"},{"key":"10.1016\/j.displa.2026.103556_b20","doi-asserted-by":"crossref","unstructured":"R. Li, K. Li, Y.-C. Kuo, M. Shu, X. Qi, X. Shen, J. Jia, Referring Image Segmentation via Recurrent Refinement Networks, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, CVPR, 2018.","DOI":"10.1109\/CVPR.2018.00602"},{"key":"10.1016\/j.displa.2026.103556_b21","doi-asserted-by":"crossref","unstructured":"S. Huang, T. Hui, S. Liu, G. Li, Y. Wei, J. Han, L. Liu, B. Li, Referring Image Segmentation via Cross-Modal Progressive Comprehension, in: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2020, pp. 10485\u201310494.","DOI":"10.1109\/CVPR42600.2020.01050"},{"key":"10.1016\/j.displa.2026.103556_b22","doi-asserted-by":"crossref","unstructured":"G. Luo, Y. Zhou, X. Sun, L. Cao, C. Wu, C. Deng, R. Ji, Multi-Task Collaborative Network for Joint Referring Expression Comprehension and Segmentation, in: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2020, pp. 10031\u201310040.","DOI":"10.1109\/CVPR42600.2020.01005"},{"key":"10.1016\/j.displa.2026.103556_b23","doi-asserted-by":"crossref","unstructured":"H. Ding, C. Liu, S. Wang, X. Jiang, Vision-Language Transformer and Query Generation for Referring Segmentation, in: 2021 IEEE\/CVF International Conference on Computer Vision, ICCV, 2021, pp. 16301\u201316310.","DOI":"10.1109\/ICCV48922.2021.01601"},{"key":"10.1016\/j.displa.2026.103556_b24","doi-asserted-by":"crossref","unstructured":"A. Kamath, M. Singh, Y. LeCun, I. Misra, G. Synnaeve, N. Carion, MDETR - Modulated Detection for End-to-End Multi-Modal Understanding, in: 2021 IEEE\/CVF International Conference on Computer Vision, ICCV, 2021, pp. 1760\u20131770.","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"10.1016\/j.displa.2026.103556_b25","doi-asserted-by":"crossref","unstructured":"Z. Yang, J. Wang, Y. Tang, K. Chen, H. Zhao, P.H.S. Torr, LAVT: Language-Aware Vision Transformer for Referring Image Segmentation, in: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2021, pp. 18134\u201318144.","DOI":"10.1109\/CVPR52688.2022.01762"},{"key":"10.1016\/j.displa.2026.103556_b26","doi-asserted-by":"crossref","unstructured":"C. Liu, H. Ding, X. Jiang, GRES: Generalized Referring Expression Segmentation, in: CVPR, 2023.","DOI":"10.1109\/CVPR52729.2023.02259"},{"key":"10.1016\/j.displa.2026.103556_b27","doi-asserted-by":"crossref","unstructured":"S.-A. Liu, Y. Zhang, Z. Qiu, H. Xie, Y. Zhang, T. Yao, CARIS: Context-Aware Referring Image Segmentation, in: Proceedings of the 31st ACM International Conference on Multimedia, 2023.","DOI":"10.1145\/3581783.3612117"},{"key":"10.1016\/j.displa.2026.103556_b28","article-title":"Towards robust referring image segmentation","author":"Wu","year":"2024","journal-title":"IEEE-TIP"},{"key":"10.1016\/j.displa.2026.103556_b29","series-title":"TensorFlow: Large-scale machine learning on heterogeneous systems","author":"Abadi","year":"2015"},{"key":"10.1016\/j.displa.2026.103556_b30","series-title":"Advances in Neural Information Processing Systems 32","first-page":"8024","article-title":"PyTorch: An imperative style, high-performance deep learning library","author":"Paszke","year":"2019"},{"key":"10.1016\/j.displa.2026.103556_b31","unstructured":"P. Kr\u00e4henb\u00fchl, V. Koltun, Efficient Inference in Fully Connected CRFs with Gaussian Edge Potentials, in: NIPS, 2011."},{"key":"10.1016\/j.displa.2026.103556_b32","unstructured":"A. Radford, J.W. Kim, C. Hallacy, A. Ramesh, G. Goh, S. Agarwal, G. Sastry, A. Askell, P. Mishkin, J. Clark, G. Krueger, I. Sutskever, Learning Transferable Visual Models From Natural Language Supervision, in: International Conference on Machine Learning, 2021."},{"key":"10.1016\/j.displa.2026.103556_b33","doi-asserted-by":"crossref","unstructured":"K. He, X. Zhang, S. Ren, J. Sun, Deep Residual Learning for Image Recognition, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, CVPR, 2016.","DOI":"10.1109\/CVPR.2016.90"},{"key":"10.1016\/j.displa.2026.103556_b34","doi-asserted-by":"crossref","unstructured":"Y. Yang, C. Ma, J. Yao, Z. Zhong, Y. Zhang, Y. Wang, ReMamber: Referring Image Segmentation with Mamba Twister, in: European Conference on Computer Vision, ECCV, 2024.","DOI":"10.1007\/978-3-031-72684-2_7"}],"container-title":["Displays"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0141938226002192?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0141938226002192?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T11:24:10Z","timestamp":1780399450000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0141938226002192"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,12]]},"references-count":34,"alternative-id":["S0141938226002192"],"URL":"https:\/\/doi.org\/10.1016\/j.displa.2026.103556","relation":{},"ISSN":["0141-9382"],"issn-type":[{"value":"0141-9382","type":"print"}],"subject":[],"published":{"date-parts":[[2026,12]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"CLEVR-Remv: A synthetic multimodal dataset and multi-view benchmark for Referring Expression Segmentation","name":"articletitle","label":"Article Title"},{"value":"Displays","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.displa.2026.103556","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"103556"}}