{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,24]],"date-time":"2026-01-24T02:35:06Z","timestamp":1769222106824,"version":"3.49.0"},"reference-count":42,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T00:00:00Z","timestamp":1769126400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T00:00:00Z","timestamp":1769126400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"DOI":"10.1007\/s11227-025-08216-4","type":"journal-article","created":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T14:32:30Z","timestamp":1769178750000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["VLCA-MFF: enhanced referring image segmentation via visual-linguistic co-attention and multilevel feature fusion"],"prefix":"10.1007","volume":"82","author":[{"given":"Lixia","family":"Ji","sequence":"first","affiliation":[]},{"given":"Yunlong","family":"Du","sequence":"additional","affiliation":[]},{"given":"Zhengjie","family":"Gong","sequence":"additional","affiliation":[]},{"given":"Ziliang","family":"Feng","sequence":"additional","affiliation":[]},{"given":"Han","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,23]]},"reference":[{"key":"8216_CR1","doi-asserted-by":"crossref","unstructured":"Patashnik O, Wu Z, Shechtman E, Cohen-Or D, Lischinski D (2021) Styleclip: Text-driven manipulation of stylegan imagery. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2085\u20132094","DOI":"10.1109\/ICCV48922.2021.00209"},{"key":"8216_CR2","doi-asserted-by":"crossref","unstructured":"Toromanoff M, Wirbel E, Moutarde F (2020) End-to-end model-free reinforcement learning for urban driving using implicit affordances. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7153\u20137162","DOI":"10.1109\/CVPR42600.2020.00718"},{"key":"8216_CR3","unstructured":"Pate S, Xu W, Yang Z, Love M, Ganguri S, Wong LL (2021) Natural language for human-robot collaboration: Problems beyond language grounding. arXiv preprint arXiv:2110.04441"},{"key":"8216_CR4","doi-asserted-by":"crossref","unstructured":"Chng YX, Zheng H, Han Y, Qiu X, Huang G (2023) Mask grounding for referring image segmentation. arXiv preprint arXiv:2312.12198","DOI":"10.1109\/CVPR52733.2024.02509"},{"key":"8216_CR5","doi-asserted-by":"crossref","unstructured":"Hu R, Rohrbach M, Darrell T (2016) Segmentation from natural language expressions. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part I 14, pp. 108\u2013124. Springer","DOI":"10.1007\/978-3-319-46448-0_7"},{"key":"8216_CR6","doi-asserted-by":"crossref","unstructured":"Liu C, Lin Z, Shen X, Yang J, Lu X, Yuille A (2017) Recurrent multimodal interaction for referring image segmentation. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1271\u20131280","DOI":"10.1109\/ICCV.2017.143"},{"key":"8216_CR7","doi-asserted-by":"crossref","unstructured":"Liu C, Lin Z, Shen X, Yang J, Lu X, Yuille A (2017) Recurrent multimodal interaction for referring image segmentation. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1271\u20131280","DOI":"10.1109\/ICCV.2017.143"},{"key":"8216_CR8","doi-asserted-by":"crossref","unstructured":"Li R, Li K, Kuo Y-C, Shu M, Qi X, Shen X, Jia J (2018) Referring image segmentation via recurrent refinement networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5745\u20135753","DOI":"10.1109\/CVPR.2018.00602"},{"issue":"11","key":"8216_CR9","doi-asserted-by":"publisher","first-page":"2278","DOI":"10.1109\/5.726791","volume":"86","author":"Y LeCun","year":"1998","unstructured":"LeCun Y, Bottou L, Bengio Y, Haffner P (1998) Gradient-based learning applied to document recognition. Proc IEEE 86(11):2278\u20132324","journal-title":"Proc IEEE"},{"key":"8216_CR10","doi-asserted-by":"crossref","unstructured":"Huang S, Hui T, Liu S, Li G, Wei Y, Han J, Liu L, Li B (2020) Referring image segmentation via cross-modal progressive comprehension. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10488\u201310497","DOI":"10.1109\/CVPR42600.2020.01050"},{"key":"8216_CR11","doi-asserted-by":"crossref","unstructured":"Hu Z, Feng G, Sun J, Zhang L, Lu H (2020) Bi-directional relationship inferring network for referring image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4424\u20134433","DOI":"10.1109\/CVPR42600.2020.00448"},{"key":"8216_CR12","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. Adv Neural Inf Process Syst30"},{"key":"8216_CR13","doi-asserted-by":"crossref","unstructured":"Yang Z, Wang J, Tang Y, Chen K, Zhao H, Torr PH (2022) Lavt: Language-aware vision transformer for referring image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18155\u201318165","DOI":"10.1109\/CVPR52688.2022.01762"},{"key":"8216_CR14","doi-asserted-by":"crossref","unstructured":"Ding H, Liu C, Wang S, Jiang X (2021) Vision-language transformer and query generation for referring segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 16321\u201316330","DOI":"10.1109\/ICCV48922.2021.01601"},{"key":"8216_CR15","unstructured":"Cao J, Dai B, Li Y, Qin X, Wang J (2024) Collaborative position reasoning network for referring image segmentation. arXiv preprint arXiv:2401.11775"},{"key":"8216_CR16","doi-asserted-by":"crossref","unstructured":"Liu C, Ding H, Zhang Y, Jiang X (2023) Multi-modal mutual attention and iterative interaction for referring image segmentation. IEEE Transactions on Image Processing","DOI":"10.1109\/TIP.2023.3277791"},{"key":"8216_CR17","first-page":"14729","volume":"35","author":"Z Zhang","year":"2022","unstructured":"Zhang Z, Zhu Y, Liu J, Liang X, Ke W (2022) Coupalign: coupling word-pixel with sentence-mask alignments for referring image segmentation. Adv Neural Inf Process Syst 35:14729\u201314742","journal-title":"Adv Neural Inf Process Syst"},{"key":"8216_CR18","doi-asserted-by":"crossref","unstructured":"Wei Z, Chen X, Chen M, Zhu S (2023) Linguistic query-guided mask generation for referring image segmentation. arXiv preprint arXiv:2301.06429","DOI":"10.2139\/ssrn.4687384"},{"key":"8216_CR19","doi-asserted-by":"crossref","unstructured":"Liu Y, Ge P, Ma H, Fan S, Liu Q, Huang D, Wang Y (2023) Towards generalizable referring image segmentation via target prompt and visual coherence. arXiv preprint arXiv:2312.00452","DOI":"10.1109\/ICIP51287.2024.10647728"},{"key":"8216_CR20","doi-asserted-by":"crossref","unstructured":"Luo G, Zhou Y, Sun X, Cao L, Wu C, Deng C, Ji R (2020) Multi-task collaborative network for joint referring expression comprehension and segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10034\u201310043","DOI":"10.1109\/CVPR42600.2020.01005"},{"key":"8216_CR21","unstructured":"Zhu X, Su W, Lu L, Li B, Wang X, Dai J (2020) Deformable detr: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159"},{"key":"8216_CR22","doi-asserted-by":"crossref","unstructured":"Kamath A, Singh M, LeCun Y, Synnaeve G, Misra I, Carion N (2021) Mdetr-modulated detection for end-to-end multi-modal understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1780\u20131790","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"8216_CR23","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S et al. (2020) An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929"},{"key":"8216_CR24","doi-asserted-by":"crossref","unstructured":"Liu Z, Lin Y, Cao Y, Hu H, Wei Y, Zhang Z, Lin S, Guo B (2021) Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"8216_CR25","doi-asserted-by":"crossref","unstructured":"Wang W, Xie E, Li X, Fan D-P, Song K, Liang D, Lu T, Luo P, Shao L (2021) Pyramid vision transformer: A versatile backbone for dense prediction without convolutions. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 568\u2013578","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"8216_CR26","doi-asserted-by":"crossref","unstructured":"Fan H, Xiong B, Mangalam K, Li Y, Yan Z, Malik J, Feichtenhofer C (2021) Multiscale vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6824\u20136835","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"8216_CR27","doi-asserted-by":"crossref","unstructured":"Li Y, Wu C-Y, Fan H, Mangalam K, Xiong B, Malik J, Feichtenhofer C (2022) Mvitv2: Improved multiscale vision transformers for classification and detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4804\u20134814","DOI":"10.1109\/CVPR52688.2022.00476"},{"key":"8216_CR28","doi-asserted-by":"crossref","unstructured":"Wu Y-H, Liu Y, Zhan X, Cheng M-M (2022) P2t: Pyramid pooling transformer for scene understanding. IEEE transactions on pattern analysis and machine intelligence","DOI":"10.1109\/TPAMI.2022.3202765"},{"key":"8216_CR29","doi-asserted-by":"crossref","unstructured":"Peng Y, Xia F, Zhang C, Mao J (2024) Deformation feature extraction and double attention feature pyramid network for bearing surface defects detection. IEEE Transactions on Industrial Informatics","DOI":"10.1109\/TII.2024.3370330"},{"key":"8216_CR30","unstructured":"Devlin J, Chang M-W, Lee K, Toutanova K (2018) Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805"},{"key":"8216_CR31","unstructured":"Wu Y, Schuster M, Chen Z, Le QV, Norouzi M, Macherey W, Krikun M, Cao Y, Gao Q, Macherey K et al. (2016) Google\u2019s neural machine translation system: Bridging the gap between human and machine translation. arXiv preprint arXiv:1609.08144"},{"key":"8216_CR32","unstructured":"Liu S, Huang D, Wang Y (2019) Learning spatial fusion for single-shot object detection. arXiv preprint arXiv:1911.09516"},{"key":"8216_CR33","doi-asserted-by":"crossref","unstructured":"Wang G, Wang K, Lin L (2019) Adaptively connected neural networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1781\u20131790","DOI":"10.1109\/CVPR.2019.00188"},{"key":"8216_CR34","doi-asserted-by":"crossref","unstructured":"Yu L, Poirson P, Yang S, Berg AC, Berg TL (2016) Modeling context in referring expressions. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part II 14, pp. 69\u201385. Springer","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"8216_CR35","doi-asserted-by":"crossref","unstructured":"Mao J, Huang J, Toshev A, Camburu O, Yuille AL, Murphy K (2016) Generation and comprehension of unambiguous object descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 11\u201320","DOI":"10.1109\/CVPR.2016.9"},{"key":"8216_CR36","doi-asserted-by":"crossref","unstructured":"Nagaraja VK, Morariu VI, Davis LS (2016) Modeling context between objects for referring expression understanding. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part IV 14, pp. 792\u2013807 . Springer","DOI":"10.1007\/978-3-319-46493-0_48"},{"key":"8216_CR37","doi-asserted-by":"crossref","unstructured":"Wolf T, Debut L, Sanh V, Chaumond J, Delangue C, Moi A, Cistac P, Rault T, Louf R, Funtowicz M et al. (2020) Transformers: State-of-the-art natural language processing. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pp. 38\u201345","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"8216_CR38","doi-asserted-by":"crossref","unstructured":"Yang S, Xia M, Li G, Zhou H-Y, Yu Y (2021) Bottom-up shift and reasoning for referring image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11266\u201311275","DOI":"10.1109\/CVPR46437.2021.01111"},{"key":"8216_CR39","doi-asserted-by":"crossref","unstructured":"Luo G, Zhou Y, Ji R, Sun X, Su J, Lin C-W, Tian Q (2020) Cascade grouped attention network for referring expression segmentation. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 1274\u20131282","DOI":"10.1145\/3394171.3414006"},{"key":"8216_CR40","doi-asserted-by":"crossref","unstructured":"Jing Y, Kong T, Wang W, Wang L., Li L, Tan T (2021) Locate then segment: A strong pipeline for referring image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9858\u20139867","DOI":"10.1109\/CVPR46437.2021.00973"},{"key":"8216_CR41","doi-asserted-by":"crossref","unstructured":"Wang Z, Lu Y, Li Q, Tao X, Guo Y, Gong M, Liu T (2022) Cris: Clip-driven referring image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11686\u201311695","DOI":"10.1109\/CVPR52688.2022.01139"},{"key":"8216_CR42","doi-asserted-by":"crossref","unstructured":"Liu C, Ding H, Jiang X (2023) Gres: Generalized referring expression segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23592\u201323601","DOI":"10.1109\/CVPR52729.2023.02259"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-025-08216-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11227-025-08216-4","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-025-08216-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T14:32:43Z","timestamp":1769178763000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11227-025-08216-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,23]]},"references-count":42,"journal-issue":{"issue":"2","published-online":{"date-parts":[[2026,1]]}},"alternative-id":["8216"],"URL":"https:\/\/doi.org\/10.1007\/s11227-025-08216-4","relation":{},"ISSN":["1573-0484"],"issn-type":[{"value":"1573-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,1,23]]},"assertion":[{"value":"10 July 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 December 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 January 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"92"}}