{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,16]],"date-time":"2026-03-16T12:06:48Z","timestamp":1773662808177,"version":"3.50.1"},"reference-count":67,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2026,1,16]],"date-time":"2026-01-16T00:00:00Z","timestamp":1768521600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,16]],"date-time":"2026-01-16T00:00:00Z","timestamp":1768521600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100004735","name":"Natural Science Foundation of Hunan Province","doi-asserted-by":"publisher","award":["2023JJ30082"],"award-info":[{"award-number":["2023JJ30082"]}],"id":[{"id":"10.13039\/501100004735","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Multimed Info Retr"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1007\/s13735-025-00393-5","type":"journal-article","created":{"date-parts":[[2026,1,16]],"date-time":"2026-01-16T10:44:45Z","timestamp":1768560285000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["TFANet: three-stage image-text feature alignment network for robust referring image segmentation"],"prefix":"10.1007","volume":"15","author":[{"given":"Qianqi","family":"Lu","sequence":"first","affiliation":[]},{"given":"Yuxiang","family":"Xie","sequence":"additional","affiliation":[]},{"given":"Jing","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Shiwei","family":"Zou","sequence":"additional","affiliation":[]},{"given":"Yan","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Xidao","family":"Luan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,16]]},"reference":[{"key":"393_CR1","doi-asserted-by":"crossref","unstructured":"Cheng MM, Zheng S, Lin WY, et\u00a0al (2014) Imagespirit: Verbal guided image parsing. ACM Transactions on Graphics","DOI":"10.1145\/2682628"},{"key":"393_CR2","doi-asserted-by":"crossref","unstructured":"Chen J, Shen Y, Gao J, et\u00a0al (2018) Language-based image editing with recurrent attentive models. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 8721\u20138729","DOI":"10.1109\/CVPR.2018.00909"},{"key":"393_CR3","doi-asserted-by":"crossref","unstructured":"Patashnik O, Wu Z, Shechtman E, et\u00a0al (2021) Styleclip: Text-driven manipulation of stylegan imagery. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp 2065\u20132074","DOI":"10.1109\/ICCV48922.2021.00209"},{"issue":"4","key":"393_CR4","doi-asserted-by":"publisher","first-page":"3308","DOI":"10.1109\/LRA.2018.2852786","volume":"3","author":"H Ahn","year":"2018","unstructured":"Ahn H, Choi S, Kim N et al (2018) Interactive text2pickup networks for natural language-based human\u2013robot collaboration. IEEE Robotics and Automation Letters 3(4):3308\u20133315","journal-title":"IEEE Robotics and Automation Letters"},{"key":"393_CR5","doi-asserted-by":"publisher","first-page":"6622","DOI":"10.1109\/CVPR.2019.00679","volume-title":"2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"X Wang","year":"2019","unstructured":"Wang X, Huang Q, Celikyilmaz A et al (2019) Reinforced Cross-Modal Matching and Self-Supervised Imitation Learning for Vision-Language Navigation. 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE Computer Society, Los Alamitos, CA, USA, pp 6622\u20136631"},{"issue":"2","key":"393_CR6","doi-asserted-by":"publisher","first-page":"386","DOI":"10.1109\/TPAMI.2018.2844175","volume":"42","author":"K He","year":"2020","unstructured":"He K, Gkioxari G, Doll\u00e1r P et al (2020) Mask r-cnn. IEEE Trans Pattern Anal Mach Intell 42(2):386\u2013397","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"393_CR7","unstructured":"Xie E, Wang W, Yu Z, et\u00a0al (2021) Segformer: simple and efficient design for semantic segmentation with transformers. In: Proceedings of the 35th International Conference on Neural Information Processing Systems. Curran Associates Inc., Red Hook, NY, USA, NIPS \u201921"},{"key":"393_CR8","doi-asserted-by":"crossref","unstructured":"Zhang X, Xu C, Fan G, et\u00a0al (2025) Fscmf: A dual-branch frequency-spatial joint perception cross-modality network for visible and infrared image fusion. Neurocomput 641(C)","DOI":"10.1016\/j.neucom.2025.130376"},{"key":"393_CR9","unstructured":"Wang C, Pan J, Wang W, et\u00a0al (2023) Promptrestorer: A prompting image restoration method with degradation perception. In: NeurIPS"},{"issue":"2","key":"393_CR10","doi-asserted-by":"publisher","first-page":"2026","DOI":"10.1109\/TCSS.2023.3270164","volume":"11","author":"M Jian","year":"2024","unstructured":"Jian M, Lu X, Yu X et al (2024) Flow-edge-net: Video saliency detection based on optical flow and edge-weighted balance loss. IEEE Transactions on Computational Social Systems 11(2):2026\u20132035","journal-title":"IEEE Transactions on Computational Social Systems"},{"key":"393_CR11","doi-asserted-by":"crossref","unstructured":"Ding H, Liu C, Wang S, et\u00a0al (2021) Vision-language transformer and query generation for referring segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp 16321\u201316330","DOI":"10.1109\/ICCV48922.2021.01601"},{"key":"393_CR12","doi-asserted-by":"crossref","unstructured":"Kamath A, Singh M, LeCun Y, et\u00a0al (2021) Mdetr - modulated detection for end-to-end multi-modal understanding. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp 1760\u20131770","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"393_CR13","doi-asserted-by":"crossref","unstructured":"Kim N, Kim D, Kwak S, et\u00a0al (2022) Restr: Convolution-free referring image segmentation using transformers. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 18124\u201318133","DOI":"10.1109\/CVPR52688.2022.01761"},{"key":"393_CR14","doi-asserted-by":"crossref","unstructured":"Ma C, Yuhuan Y, Ju C, et\u00a0al (2023) Attrseg: Open-vocabulary semantic segmentation via attribute decomposition-aggregation. In: Oh A, Naumann T, Globerson A, et\u00a0al (eds) Advances in Neural Information Processing Systems, vol\u00a036. Curran Associates, Inc., pp 10258\u201310270","DOI":"10.52202\/075280-0450"},{"key":"393_CR15","doi-asserted-by":"crossref","unstructured":"Yang Z, Wang J, Tang Y, et\u00a0al (2022) Lavt: Language-aware vision transformer for referring image segmentation. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 18134\u201318144","DOI":"10.1109\/CVPR52688.2022.01762"},{"key":"393_CR16","doi-asserted-by":"crossref","unstructured":"Ji L, Du Y, Dang Y, et\u00a0al (2024) A survey of methods for addressing the challenges of referring image segmentation. Neurocomput 583(C)","DOI":"10.1016\/j.neucom.2024.127599"},{"key":"393_CR17","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46475-6_5","volume-title":"Computer Vision - ECCV 2016","author":"L Yu","year":"2016","unstructured":"Yu L, Poirson P, Yang S et al (2016) Modeling context in referring expressions. In: Leibe B, Matas J, Sebe N et al (eds) Computer Vision - ECCV 2016. Springer International Publishing, Cham, pp 69\u201385"},{"key":"393_CR18","doi-asserted-by":"crossref","unstructured":"Kazemzadeh S, Ordonez V, Matten M, et\u00a0al (2014) ReferItGame: Referring to objects in photographs of natural scenes. In: Moschitti A, Pang B, Daelemans W (eds) Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP). Association for Computational Linguistics, Doha, Qatar, pp 787\u2013798","DOI":"10.3115\/v1\/D14-1086"},{"key":"393_CR19","doi-asserted-by":"crossref","unstructured":"Mao J, Huang J, Toshev A, et\u00a0al (2016) Generation and comprehension of unambiguous object descriptions. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 11\u201320","DOI":"10.1109\/CVPR.2016.9"},{"key":"393_CR20","doi-asserted-by":"publisher","first-page":"108","DOI":"10.1007\/978-3-319-46448-0_7","volume-title":"Computer Vision - ECCV 2016","author":"R Hu","year":"2016","unstructured":"Hu R, Rohrbach M, Darrell T (2016) Segmentation from natural language expressions. In: Leibe B, Matas J, Sebe N et al (eds) Computer Vision - ECCV 2016. Springer International Publishing, Cham, pp 108\u2013124"},{"key":"393_CR21","doi-asserted-by":"crossref","unstructured":"Li R, Li K, Kuo YC, et\u00a0al (2018) Referring image segmentation via recurrent refinement networks. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 5745\u20135753","DOI":"10.1109\/CVPR.2018.00602"},{"key":"393_CR22","doi-asserted-by":"crossref","unstructured":"Liu C, Lin Z, Shen X, et\u00a0al (2017) Recurrent multimodal interaction for referring image segmentation. In: 2017 IEEE International Conference on Computer Vision (ICCV), pp 1280\u20131289","DOI":"10.1109\/ICCV.2017.143"},{"issue":"11","key":"393_CR23","doi-asserted-by":"publisher","first-page":"7436","DOI":"10.1109\/TPAMI.2021.3117837","volume":"44","author":"Y Han","year":"2022","unstructured":"Han Y, Huang G, Song S et al (2022) Dynamic neural networks: A survey. IEEE Trans Pattern Anal Mach Intell 44(11):7436\u20137456","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"393_CR24","doi-asserted-by":"crossref","unstructured":"Feng G, Hu Z, Zhang L, et\u00a0al (2021) Encoder fusion network with co-attention embedding for referring image segmentation. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 15501\u201315510","DOI":"10.1109\/CVPR46437.2021.01525"},{"key":"393_CR25","doi-asserted-by":"crossref","unstructured":"Wang Z, Lu Y, Li Q, et\u00a0al (2022) Cris: Clip-driven referring image segmentation. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 11676\u201311685","DOI":"10.1109\/CVPR52688.2022.01139"},{"key":"393_CR26","doi-asserted-by":"crossref","unstructured":"Liu Z, Lin Y, Cao Y, et\u00a0al (2021) Swin transformer: Hierarchical vision transformer using shifted windows. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp 9992\u201310002","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"393_CR27","doi-asserted-by":"crossref","unstructured":"Liu J, Ding H, Cai Z, et\u00a0al (2023) Polyformer: Referring image segmentation as sequential polygon generation. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 18653\u201318663","DOI":"10.1109\/CVPR52729.2023.01789"},{"key":"393_CR28","doi-asserted-by":"publisher","first-page":"598","DOI":"10.1007\/978-3-031-19833-5_35","volume-title":"Computer Vision - ECCV 2022","author":"C Zhu","year":"2022","unstructured":"Zhu C, Zhou Y, Shen Y et al (2022) Seqtr: A simple yet universal network for visual grounding. In: Avidan S, Brostow G, Ciss\u00e9 M et al (eds) Computer Vision - ECCV 2022. Springer Nature Switzerland, Cham, pp 598\u2013615"},{"key":"393_CR29","doi-asserted-by":"crossref","unstructured":"Liu C, Ding H, Jiang X (2023) Gres: Generalized referring expression segmentation. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 23592\u201323601","DOI":"10.1109\/CVPR52729.2023.02259"},{"key":"393_CR30","doi-asserted-by":"crossref","unstructured":"Tang J, Zheng G, Shi C, et\u00a0al (2023) Contrastive grouping with transformer for referring image segmentation. 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) pp 23570\u201323580","DOI":"10.1109\/CVPR52729.2023.02257"},{"key":"393_CR31","doi-asserted-by":"publisher","first-page":"5823","DOI":"10.1109\/TMM.2023.3340062","volume":"26","author":"Y Cho","year":"2024","unstructured":"Cho Y, Yu H, Kang SJ (2024) Cross-aware early fusion with stage-divided vision and language transformer encoders for referring image segmentation. IEEE Trans Multimedia 26:5823\u20135833","journal-title":"IEEE Trans Multimedia"},{"key":"393_CR32","doi-asserted-by":"crossref","unstructured":"Jiang X, Yang H, Zhu K, et\u00a0al (2025) Ptq4ris: Post-training quantization for referring image segmentation. International Conference on Robotics and Automation","DOI":"10.1109\/ICRA55743.2025.11128841"},{"key":"393_CR33","doi-asserted-by":"crossref","unstructured":"Huang J, Xu Z, Liu T, et\u00a0al (2025) Densely connected parameter-efficient tuning for referring image segmentation. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp 3653\u20133661","DOI":"10.1609\/aaai.v39i4.32380"},{"key":"393_CR34","first-page":"108","volume-title":"Computer Vision - ECCV 2024","author":"Y Yang","year":"2025","unstructured":"Yang Y, Ma C, Yao J et al (2025) Remamber: Referring image segmentation with mamba twister. In: Leonardis A, Ricci E, Roth S et al (eds) Computer Vision - ECCV 2024. Springer Nature Switzerland, Cham, pp 108\u2013126"},{"key":"393_CR35","unstructured":"Gu A, Dao T (2023) Mamba: Linear-time sequence modeling with selective state spaces. CoRR abs\/2312.00752. 2312.00752"},{"key":"393_CR36","volume-title":"Advances in Neural Information Processing Systems","author":"A Vaswani","year":"2017","unstructured":"Vaswani A, Shazeer N, Parmar N et al (2017) Attention is all you need. In: Guyon I, Luxburg UV, Bengio S et al (eds) Advances in Neural Information Processing Systems, vol 30. Curran Associates Inc"},{"key":"393_CR37","doi-asserted-by":"crossref","unstructured":"Dai Z, Yang Z, Yang Y, et\u00a0al (2019) Transformer-xl: Attentive language models beyond a fixed-length context. In: Annual Meeting of the Association for Computational Linguistics","DOI":"10.18653\/v1\/P19-1285"},{"key":"393_CR38","unstructured":"Devlin J, Chang MW, Lee K, et\u00a0al (2019) BERT: Pre-training of deep bidirectional transformers for language understanding. In: Burstein J, Doran C, Solorio T (eds) Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers). Association for Computational Linguistics, Minneapolis, Minnesota, pp 4171\u20134186"},{"key":"393_CR39","volume-title":"Advances in Neural Information Processing Systems","author":"Z Yang","year":"2019","unstructured":"Yang Z, Dai Z, Yang Y et al (2019) Xlnet: Generalized autoregressive pretraining for language understanding. In: Wallach H, Larochelle H, Beygelzimer A et al (eds) Advances in Neural Information Processing Systems, vol 32. Curran Associates Inc"},{"key":"393_CR40","unstructured":"Kim W, Son B, Kim I (2021) Vilt: Vision-and-language transformer without convolution or region supervision. In: Meila M, Zhang T (eds) Proceedings of the 38th International Conference on Machine Learning, Proceedings of Machine Learning Research, vol 139. PMLR, pp 5583\u20135594"},{"key":"393_CR41","volume-title":"Advances in Neural Information Processing Systems","author":"J Lu","year":"2019","unstructured":"Lu J, Batra D, Parikh D et al (2019) Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Wallach H, Larochelle H, Beygelzimer A et al (eds) Advances in Neural Information Processing Systems, vol 32. Curran Associates Inc"},{"key":"393_CR42","unstructured":"Ramesh A, Pavlov M, Goh G, et\u00a0al (2021) Zero-shot text-to-image generation. In: Meila M, Zhang T (eds) Proceedings of the 38th International Conference on Machine Learning, Proceedings of Machine Learning Research, vol 139. PMLR, pp 8821\u20138831"},{"key":"393_CR43","unstructured":"Huang Z, Zeng Z, Liu B, et\u00a0al (2020) Pixel-bert: Aligning image pixels with text by deep multi-modal transformers. CoRR abs\/2004.00849. 2004.00849"},{"key":"393_CR44","unstructured":"Chen S, Guhur PL, Schmid C, et\u00a0al (2021) History aware multimodal transformer for vision-and-language navigation. In: Ranzato M, Beygelzimer A, Dauphin Y, et\u00a0al (eds) Advances in Neural Information Processing Systems, vol\u00a034. Curran Associates, Inc., pp 5834\u20135847"},{"key":"393_CR45","unstructured":"Radford A, Kim JW, Hallacy C, et\u00a0al (2021) Learning transferable visual models from natural language supervision. In: Meila M, Zhang T (eds) Proceedings of the 38th International Conference on Machine Learning, Proceedings of Machine Learning Research, vol 139. PMLR, pp 8748\u20138763"},{"key":"393_CR46","doi-asserted-by":"crossref","unstructured":"Hu R, Singh A (2021) Unit: Multimodal multitask learning with a unified transformer. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp 1419\u20131429","DOI":"10.1109\/ICCV48922.2021.00147"},{"key":"393_CR47","unstructured":"Zhang Z, Zhu Y, Liu J, et\u00a0al (2022) Coupalign: coupling word-pixel with sentence-mask alignments for referring image segmentation. In: Proceedings of the 36th International Conference on Neural Information Processing Systems. Curran Associates Inc., Red Hook, NY, USA, NIPS \u201922"},{"key":"393_CR48","doi-asserted-by":"crossref","unstructured":"Chng YX, Zheng H, Han Y, et\u00a0al (2024) Mask grounding for referring image segmentation. In: 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 26563\u201326573","DOI":"10.1109\/CVPR52733.2024.02509"},{"key":"393_CR49","doi-asserted-by":"crossref","unstructured":"Liu Y, Tian Y, Zhao Y, et\u00a0al (2024) Vmamba: Visual state space model. In: Globerson A, Mackey L, Belgrave D, et\u00a0al (eds) Advances in Neural Information Processing Systems, vol\u00a037. Curran Associates, Inc., pp 103031\u2013103063","DOI":"10.52202\/079017-3273"},{"key":"393_CR50","doi-asserted-by":"publisher","first-page":"656","DOI":"10.1007\/978-3-030-01252-6_39","volume-title":"Computer Vision - ECCV 2018","author":"E Margffoy-Tuay","year":"2018","unstructured":"Margffoy-Tuay E, P\u00e9rez JC, Botero E et al (2018) Dynamic multimodal instance segmentation guided by natural language queries. In: Ferrari V, Hebert M, Sminchisescu C et al (eds) Computer Vision - ECCV 2018. Springer International Publishing, Cham, pp 656\u2013672"},{"key":"393_CR51","doi-asserted-by":"crossref","unstructured":"Hu Z, Feng G, Sun J, et\u00a0al (2020) Bi-directional relationship inferring network for referring image segmentation. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 4423\u20134432","DOI":"10.1109\/CVPR42600.2020.00448"},{"key":"393_CR52","doi-asserted-by":"crossref","unstructured":"Luo G, Zhou Y, Ji R, et\u00a0al (2020) Cascade grouped attention network for referring expression segmentation. In: Proceedings of the 28th ACM International Conference on Multimedia. Association for Computing Machinery, New York, NY, USA, MM \u201920, p 1274\u20131282","DOI":"10.1145\/3394171.3414006"},{"key":"393_CR53","doi-asserted-by":"publisher","first-page":"38","DOI":"10.1007\/978-3-030-01231-1_3","volume-title":"Computer Vision - ECCV 2018","author":"H Shi","year":"2018","unstructured":"Shi H, Li H, Meng F et al (2018) Key-word-aware network for referring expression image segmentation. In: Ferrari V, Hebert M, Sminchisescu C et al (eds) Computer Vision - ECCV 2018. Springer International Publishing, Cham, pp 38\u201354"},{"key":"393_CR54","doi-asserted-by":"crossref","unstructured":"Ye L, Rochan M, Liu Z, et\u00a0al (2019) Cross-modal self-attention network for referring image segmentation. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 10494\u201310503","DOI":"10.1109\/CVPR.2019.01075"},{"issue":"9","key":"393_CR55","first-page":"4761","volume":"44","author":"S Liu","year":"2022","unstructured":"Liu S, Hui T, Huang S et al (2022) Cross-modal progressive comprehension for referring segmentation. IEEE Trans Pattern Anal Mach Intell 44(9):4761\u20134775","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"393_CR56","unstructured":"Song J, Lee HC (2022) X-vit: High performance linear vision transformer without softmax. arXiv:2205.13805"},{"key":"393_CR57","doi-asserted-by":"crossref","unstructured":"Luo G, Zhou Y, Sun X, et\u00a0al (2020) Multi-task collaborative network for joint referring expression comprehension and segmentation. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 10031\u201310040","DOI":"10.1109\/CVPR42600.2020.01005"},{"key":"393_CR58","doi-asserted-by":"crossref","unstructured":"Jing Y, Kong T, Wang W, et\u00a0al (2021) Locate then segment: A strong pipeline for referring image segmentation. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 9853\u20139862","DOI":"10.1109\/CVPR46437.2021.00973"},{"key":"393_CR59","doi-asserted-by":"crossref","unstructured":"Yan Y, He X, Chen S, et\u00a0al (2024) Calibration & reconstruction: Deeply integrated language for referring image segmentation. In: Proceedings of the 2024 International Conference on Multimedia Retrieval. Association for Computing Machinery, New York, NY, USA, ICMR \u201924, p 451\u2013459","DOI":"10.1145\/3652583.3658095"},{"key":"393_CR60","doi-asserted-by":"crossref","unstructured":"Huang S, Hui T, Liu S, et\u00a0al (2020) Referring image segmentation via cross-modal progressive comprehension. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR42600.2020.01050"},{"key":"#cr-split#-393_CR61.1","doi-asserted-by":"crossref","unstructured":"Hui T, Liu S, Huang S et al (2020) Linguistic structure guided context modeling for referring image segmentation. In: Part X","DOI":"10.1007\/978-3-030-58607-2_4"},{"key":"#cr-split#-393_CR61.2","unstructured":"(ed) Computer Vision - ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings. Springer-Verlag, Berlin, Heidelberg, pp 59-75"},{"key":"393_CR62","doi-asserted-by":"crossref","unstructured":"Yang S, Xia M, Li G, et\u00a0al (2021) Bottom-up shift and reasoning for referring image segmentation. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 11261\u201311270","DOI":"10.1109\/CVPR46437.2021.01111"},{"key":"393_CR63","doi-asserted-by":"crossref","unstructured":"Hu Y, Wang Q, Shao W, et\u00a0al (2023) Beyond one-to-one: Rethinking the referring image segmentation. In: 2023 IEEE\/CVF International Conference on Computer Vision (ICCV), pp 4044\u20134054","DOI":"10.1109\/ICCV51070.2023.00376"},{"key":"393_CR64","doi-asserted-by":"crossref","unstructured":"Yang Z, Wang J, Tang Y, et\u00a0al (2023) Semantics-aware dynamic localization and refinement for referring image segmentation. In: Proceedings of the Thirty-Seventh AAAI Conference on Artificial Intelligence and Thirty-Fifth Conference on Innovative Applications of Artificial Intelligence and Thirteenth Symposium on Educational Advances in Artificial Intelligence. AAAI Press, AAAI\u201923\/IAAI\u201923\/EAAI\u201923","DOI":"10.1609\/aaai.v37i3.25428"},{"key":"393_CR65","doi-asserted-by":"crossref","unstructured":"Yue P, Lin J, Zhang S, et\u00a0al (2024) Adaptive selection based referring image segmentation. In: Proceedings of the 32nd ACM International Conference on Multimedia. Association for Computing Machinery, New York, NY, USA, MM \u201924, p 1101\u20131110","DOI":"10.1145\/3664647.3680850"},{"key":"393_CR66","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision - ECCV 2014","author":"TY Lin","year":"2014","unstructured":"Lin TY, Maire M, Belongie S et al (2014) Microsoft coco: Common objects in context. In: Fleet D, Pajdla T, Schiele B et al (eds) Computer Vision - ECCV 2014. Springer International Publishing, Cham, pp 740\u2013755"}],"container-title":["International Journal of Multimedia Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-025-00393-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13735-025-00393-5","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-025-00393-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,16]],"date-time":"2026-03-16T11:10:42Z","timestamp":1773659442000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13735-025-00393-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,16]]},"references-count":67,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026,3]]}},"alternative-id":["393"],"URL":"https:\/\/doi.org\/10.1007\/s13735-025-00393-5","relation":{},"ISSN":["2192-6611","2192-662X"],"issn-type":[{"value":"2192-6611","type":"print"},{"value":"2192-662X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,1,16]]},"assertion":[{"value":"30 April 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 November 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 December 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 January 2026","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"6"}}