{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T02:00:17Z","timestamp":1777600817101,"version":"3.51.4"},"reference-count":58,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T00:00:00Z","timestamp":1748304000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T00:00:00Z","timestamp":1748304000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62076246"],"award-info":[{"award-number":["62076246"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62076246"],"award-info":[{"award-number":["62076246"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2025,9]]},"DOI":"10.1007\/s00371-025-03931-8","type":"journal-article","created":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T06:40:44Z","timestamp":1748328044000},"page":"9355-9372","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Fine-grained text-based person re-identification via interlaced cross-attention and LoRA fine-tuning"],"prefix":"10.1007","volume":"41","author":[{"given":"Mengnan","family":"Hu","sequence":"first","affiliation":[]},{"given":"Wenjing","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Qianli","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Rong","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,27]]},"reference":[{"key":"3931_CR1","doi-asserted-by":"publisher","unstructured":"Zheng, Z., Zheng, L., Garrett, M. et al. Dual-path convolutional image-text embedding. ACM Transactions on Multimedia Computing, Communications, and Applications, 16 (2017). https:\/\/doi.org\/10.1145\/3383184.","DOI":"10.1145\/3383184"},{"key":"3931_CR2","doi-asserted-by":"publisher","unstructured":"Sarafianos, N., Xu, X., Kakadiaris, I.: adversarial representation learning for text-to-image matching. In: Proceedings of IEEE\/CVF International Conference on Computer Vision (ICCV), pp.5813\u20135823 (2019). https:\/\/doi.org\/10.1109\/ICCV.2019.00591.","DOI":"10.1109\/ICCV.2019.00591"},{"key":"3931_CR3","doi-asserted-by":"publisher","unstructured":"Li, S., Xiao, T., Li, H., et al.: Person search with natural language description. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp.5187\u20135196 (2017). https:\/\/doi.org\/10.1109\/CVPR.2017.551.","DOI":"10.1109\/CVPR.2017.551"},{"key":"3931_CR4","doi-asserted-by":"publisher","unstructured":"Chen, T., Xu, C., Luo, J.: Improving text-based person search by spatial matching and adaptive threshold. In: Proceedings of IEEE Winter Conference on Applications of Computer Vision, pp.1879\u20131887 (2018). https:\/\/doi.org\/10.1109\/WACV.2018.00208.","DOI":"10.1109\/WACV.2018.00208"},{"key":"3931_CR5","doi-asserted-by":"publisher","first-page":"11189","DOI":"10.1609\/aaai.v34i07.6777","volume":"34","author":"Y Jing","year":"2020","unstructured":"Jing, Y., Si, C., Wang, J., et al.: Pose-guided multi-granularity attention network for text-based person search. Proceed. AAAI Conf. Artif. Intell. 34, 11189\u201311196 (2020). https:\/\/doi.org\/10.1609\/aaai.v34i07.6777","journal-title":"Proceed. AAAI Conf. Artif. Intell."},{"key":"3931_CR6","doi-asserted-by":"publisher","unstructured":"Wang, Z., Fang, Z., Wang, J. et al. ViTAA: Visual-textual attributes alignment in person search by natural language. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 402\u2013420 (2020). https:\/\/doi.org\/10.1007\/978-3-030-58610-2_24.","DOI":"10.1007\/978-3-030-58610-2_24"},{"key":"3931_CR7","doi-asserted-by":"publisher","unstructured":"Shao, Z., Zhang, X., Fang, M. et al: Learning granularity-unified representations for text-to-image person re-identification. In: Proceedings of the 30th ACM International Conference on Multimedia, pp.5566\u20135574 (2022). https:\/\/doi.org\/10.1145\/3503161.3548028.","DOI":"10.1145\/3503161.3548028"},{"key":"3931_CR8","doi-asserted-by":"publisher","first-page":"217","DOI":"10.1109\/TMM.2021.3050082","volume":"24","author":"X Gong","year":"2022","unstructured":"Gong, X., Yao, Z., Li, X., et al.: LAG-Net: multi-granularity network for person re-identification via local attention system. IEEE Trans. Multimedia 24, 217\u2013229 (2022). https:\/\/doi.org\/10.1109\/TMM.2021.3050082","journal-title":"IEEE Trans. Multimedia"},{"key":"3931_CR9","doi-asserted-by":"publisher","unstructured":"He, K., Zhang, X., Ren, S. et al.: Deep residual learning for image recognition. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp.770\u2013778 (2016). https:\/\/doi.org\/10.1109\/CVPR.2016.90.","DOI":"10.1109\/CVPR.2016.90"},{"key":"3931_CR10","doi-asserted-by":"publisher","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A. et al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv (2020). https:\/\/doi.org\/10.48550\/arXiv.2010.11929.","DOI":"10.48550\/arXiv.2010.11929"},{"key":"3931_CR11","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M-W., Lee, K. et al.: BERT: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of North American Chapter of the Association for Computational Linguistics, vol.1, pp.2 (2019). https:\/\/doi.org\/10.48550\/arXiv.1810.04805.","DOI":"10.48550\/arXiv.1810.04805"},{"key":"3931_CR12","doi-asserted-by":"publisher","unstructured":"Radford, A., Kim, J. W., Hallacy, C., et al.: Learning transferable visual models from natural language supervision. In: Proceedings of International Conference on Machine Learning (PMLR), pp.8748\u20138763 (2021). https:\/\/doi.org\/10.48550\/arXiv.2103.00020.","DOI":"10.48550\/arXiv.2103.00020"},{"key":"3931_CR13","doi-asserted-by":"publisher","unstructured":"Li, J., Li, D., Xiong, C., et al.: Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: Proceedings of International conference on machine learning (PMLR), pp.12888\u201312900 (2022). https:\/\/doi.org\/10.48550\/arXiv.2201.12086.","DOI":"10.48550\/arXiv.2201.12086"},{"key":"3931_CR14","doi-asserted-by":"publisher","unstructured":"Li, J., Selvaraju, R., Gotmare, A. et al.: Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, vol. 34, pp. 9694\u20139705 (2021). https:\/\/doi.org\/10.48550\/arXiv.2107.07651.","DOI":"10.48550\/arXiv.2107.07651"},{"key":"3931_CR15","doi-asserted-by":"publisher","unstructured":"Jiang, D., Ye, M.: Cross-modal implicit relation reasoning and aligning for text-to-image person retrieval. In: Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp.2787\u20132797 (2023). https:\/\/doi.org\/10.1109\/CVPR52729.2023.00273.","DOI":"10.1109\/CVPR52729.2023.00273"},{"key":"3931_CR16","doi-asserted-by":"publisher","first-page":"6609","DOI":"10.1109\/TMM.2024.3355644","volume":"26","author":"D Lin","year":"2024","unstructured":"Lin, D., Peng, Y., Meng, J., et al.: Cross-modal adaptive dual association for text-to-image person retrieval. IEEE Trans. Multimedia 26, 6609\u20136620 (2024). https:\/\/doi.org\/10.1109\/TMM.2024.3355644","journal-title":"IEEE Trans. Multimedia"},{"key":"3931_CR17","doi-asserted-by":"publisher","unstructured":"Bai, Y., Cao, M-M., Gao, D. et al.: RaSa: relation and sensitivity aware representation learning for text-based person search. In: Proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence, vol.62, pp.555\u2013563 (2023). https:\/\/doi.org\/10.24963\/ijcai.2023\/62.","DOI":"10.24963\/ijcai.2023\/62"},{"key":"3931_CR18","doi-asserted-by":"publisher","first-page":"94","DOI":"10.1007\/s00530-024-01286-z","volume":"30","author":"K Cheng","year":"2024","unstructured":"Cheng, K., Geng, Q., Huang, S., et al.: Learning shared features from specific and ambiguous descriptions for text-based person search. Multimedia Syst. 30, 94 (2024). https:\/\/doi.org\/10.1007\/s00530-024-01286-z","journal-title":"Multimedia Syst."},{"key":"3931_CR19","doi-asserted-by":"publisher","first-page":"177","DOI":"10.1007\/s00530-024-01372-2","volume":"30","author":"Z Li","year":"2024","unstructured":"Li, Z., Xie, Y.: BCRA: bidirectional cross-modal implicit relation reasoning and aligning for text-to-image person retrieval. Multimedia Syst. 30, 177 (2024). https:\/\/doi.org\/10.1007\/s00530-024-01372-2","journal-title":"Multimedia Syst."},{"key":"3931_CR20","doi-asserted-by":"publisher","unstructured":"Hu, E. J., Shen, Y., Wallis, P. et al.: Lora: Low-rank Adaptation of Large Language Models . arXiv (2021). https:\/\/doi.org\/10.48550\/arXiv.2106.09685.","DOI":"10.48550\/arXiv.2106.09685"},{"key":"3931_CR21","doi-asserted-by":"publisher","unstructured":"Ding, Z., Ding, C., Shao, Z. et al.: Semantically self-aligned network for text-to-image part-aware person re-identification. arXiv (2021). https:\/\/doi.org\/10.48550\/arXiv.2107.12666.","DOI":"10.48550\/arXiv.2107.12666"},{"key":"3931_CR22","doi-asserted-by":"publisher","unstructured":"Zhu, A., Wang, Z., Li, Y., et al.: DSSL: Deep surroundings-person separation learning for text-based person retrieval. In: Proceedings of the 29th ACM International Conference on Multimedia, pp.209\u2013217 (2021). https:\/\/doi.org\/10.1145\/3474085.347536","DOI":"10.1145\/3474085.347536"},{"key":"3931_CR23","doi-asserted-by":"publisher","unstructured":"Li, S., Xiao, T., Li, H., et al.: Identity-aware textual-visual matching with latent co-attention. In: Proceedings of IEEE International Conference on Computer Vision (ICCV), pp.1908\u20131917 (2017). https:\/\/doi.org\/10.1109\/ICCV.2017.209.","DOI":"10.1109\/ICCV.2017.209"},{"key":"3931_CR24","doi-asserted-by":"publisher","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv (2014). https:\/\/doi.org\/10.48550\/arXiv.1409.1556.","DOI":"10.48550\/arXiv.1409.1556"},{"issue":"8","key":"3931_CR25","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997). https:\/\/doi.org\/10.1162\/neco.1997.9.8.1735","journal-title":"Neural Comput."},{"key":"3931_CR26","doi-asserted-by":"publisher","first-page":"4057","DOI":"10.1109\/TIP.2021.3068825","volume":"30","author":"Y Chen","year":"2021","unstructured":"Chen, Y., Huang, R., Chang, H., et al.: Cross-modal knowledge adaptation for language-based person search. IEEE Trans. Image Process. 30, 4057\u20134069 (2021). https:\/\/doi.org\/10.1109\/TIP.2021.3068825","journal-title":"IEEE Trans. Image Process."},{"key":"3931_CR27","doi-asserted-by":"publisher","unstructured":"Zhang, Y., Lu, H. Deep cross-modal projection learning for image-text matching. In: Proceedings of the European Conference on Computer Vision (ECCV), pp.707\u2013723 (2018). https:\/\/doi.org\/10.1007\/978-3-030-01246-5_42.","DOI":"10.1007\/978-3-030-01246-5_42"},{"key":"3931_CR28","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3383184","volume":"16","author":"Z Zheng","year":"2020","unstructured":"Zheng, Z., Zheng, L., Garrett, M., et al.: Dual-path convolutional image-text embeddings with instance loss. ACM Trans. Multimed. Comput. Commun. Appl. 16, 1\u201323 (2020). https:\/\/doi.org\/10.1145\/3383184","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"3931_CR29","doi-asserted-by":"publisher","unstructured":"Shu, X., Wen, W., Wu, H. et al. See Finer, see more: implicit modality alignment for text-based person retrieval. In: Proceedings of the European Conference on Computer Vision (ECCV) Workshops, pp. 624\u2013641 (2022). https:\/\/doi.org\/10.1007\/978-3-031-25072-9_42.","DOI":"10.1007\/978-3-031-25072-9_42"},{"key":"3931_CR30","doi-asserted-by":"publisher","unstructured":"Chen, Y., Zhang, G., Lu, Y. et al. TIPCB: A simple but effective part-based convolutional baseline for text-based person search. Neurocomputing, vlo. 494, pp.171\u2013181 (2021). https:\/\/doi.org\/10.1016\/j.neucom.2022.04.081.","DOI":"10.1016\/j.neucom.2022.04.081"},{"key":"3931_CR31","doi-asserted-by":"publisher","unstructured":"Chowdhury, S., Soni, B.: Beyond words: Esc-net revolutionizes vqa by elevating visual features and defying language priors. Computational Intelligence, vol.40 (2024). https:\/\/doi.org\/10.1111\/coin.70010.","DOI":"10.1111\/coin.70010"},{"key":"3931_CR32","doi-asserted-by":"publisher","first-page":"129906","DOI":"10.1016\/j.neucom.2025.129906","volume":"635","author":"S Chowdhury","year":"2025","unstructured":"Chowdhury, S., Soni, B.: Handling language prior and compositional reasoning issues in visual question answering system. Neurocomputing 635, 129906 (2025). https:\/\/doi.org\/10.1016\/j.neucom.2025.129906","journal-title":"Neurocomputing"},{"key":"3931_CR33","doi-asserted-by":"publisher","unstructured":"Farooq, A., Awais, M., Kittler, J. et al.: AXM-Net: Implicit cross-modal feature alignment for person re-identification. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36(4), pp.4477\u20134485 (2021). https:\/\/doi.org\/10.1609\/aaai.v36i4.20370.","DOI":"10.1609\/aaai.v36i4.20370"},{"issue":"12","key":"3931_CR34","doi-asserted-by":"publisher","first-page":"8825","DOI":"10.1007\/s00371-024-03274-w","volume":"40","author":"F Wu","year":"2024","unstructured":"Wu, F., Wang, Q., Wang, Z., et al.: (2024) ITContrast: contrastive learning with hard negative synthesis for image-text matching. Vis. Comput. 40(12), 8825\u20138838 (2024). https:\/\/doi.org\/10.1007\/s00371-024-03274-w","journal-title":"Vis. Comput."},{"key":"3931_CR35","unstructured":"Vaswani, A., Shazeer, N., Parmar, N. et al.: Attention is all you need. In: Proceedings of the 31st International Conference on Neural Information Processing Systems, pp.6000\u20136010 (2022)."},{"key":"3931_CR36","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TGRS.2024.3422007","volume":"62","author":"Z Li","year":"2024","unstructured":"Li, Z., Hu, J., Wu, K., et al.: Adjacent-atrous mechanism for expanding global receptive fields: an end-to-end network for multiattribute scene analysis in remote sensing imagery. IEEE Trans. Geosci. Remote Sens. 62, 1\u201319 (2024). https:\/\/doi.org\/10.1109\/TGRS.2024.3422007","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"key":"3931_CR37","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TGRS.2024.3516501","volume":"63","author":"Z Li","year":"2025","unstructured":"Li, Z., Hu, J., Wu, K., et al.: Comprehensive attribute difference attention network for remote sensing image semantic understanding. IEEE Trans. Geosci. Remote Sens. 63, 1\u201316 (2025). https:\/\/doi.org\/10.1109\/TGRS.2024.3516501","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"issue":"6","key":"3931_CR38","doi-asserted-by":"publisher","first-page":"509","DOI":"10.1016\/j.vrih.2023.06.003","volume":"5","author":"M Wang","year":"2023","unstructured":"Wang, M., Meng, M., Liu, J., et al.: Learning adequate alignment and interaction for cross-modal retrieval. Virtual Reality Intell. Hardware. 5(6), 509\u2013522 (2023). https:\/\/doi.org\/10.1016\/j.vrih.2023.06.003","journal-title":"Virtual Reality Intell. Hardware."},{"key":"3931_CR39","doi-asserted-by":"publisher","unstructured":"Ma, Y., Wang, M., Lu, G., et al. Multi-label semantic sharing based on graph convolutional network for image-to-text retrieval. The Visual Computer, (2024). https:\/\/doi.org\/10.1007\/s00371-024-03496-y.","DOI":"10.1007\/s00371-024-03496-y"},{"key":"3931_CR40","doi-asserted-by":"publisher","first-page":"112827","DOI":"10.1016\/j.knosys.2024.112827","volume":"309","author":"S Chowdhury","year":"2025","unstructured":"Chowdhury, S., Soni, B.: R-vqa: A robust visual question answering model. Knowl.-Based Syst. 309, 112827 (2025). https:\/\/doi.org\/10.1016\/j.knosys.2024.112827","journal-title":"Knowl.-Based Syst."},{"key":"3931_CR41","doi-asserted-by":"publisher","first-page":"109948","DOI":"10.1016\/j.engappai.2024.109948","volume":"142","author":"S Chowdhury","year":"2025","unstructured":"Chowdhury, S., Soni, B.: Envqa: improving visual question answering model by enriching the visual feature. Eng. Appl. Artif. Intell. 142, 109948 (2025). https:\/\/doi.org\/10.1016\/j.engappai.2024.109948","journal-title":"Eng. Appl. Artif. Intell."},{"key":"3931_CR42","doi-asserted-by":"publisher","unstructured":"Chen, Y-C., Li, L., Yu, L. et al. UNITER: UNiversal image-TExt representation learning. In: Proceedings of the European Conference on Computer Vision (ECCV), pp.104\u2013120 (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_7","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"3931_CR43","doi-asserted-by":"publisher","first-page":"10479","DOI":"10.1007\/s13369-023-07661-8","volume":"48","author":"S Chowdhury","year":"2023","unstructured":"Chowdhury, S.: Qsfvqa: A time efficient, scalable and optimized vqa framework. Arab. J. Sci. Eng. 48, 10479\u201310491 (2023). https:\/\/doi.org\/10.1007\/s13369-023-07661-8","journal-title":"Arab. J. Sci. Eng."},{"key":"3931_CR44","doi-asserted-by":"publisher","unstructured":"Han, X., He, S., Zhang, L. et al.: Text-based person search with limited data. In: British Machine Vision Conference (2021). https:\/\/doi.org\/10.48550\/arXiv.2110.10807.","DOI":"10.48550\/arXiv.2110.10807"},{"key":"3931_CR45","doi-asserted-by":"publisher","first-page":"6032","DOI":"10.1109\/TIP.2023.3327924","volume":"32","author":"S Yan","year":"2022","unstructured":"Yan, S., Dong, N., Zhang, L., et al.: CLIP-driven fine-grained text-image person re-identification. IEEE Trans. Image Process. 32, 6032\u20136046 (2022). https:\/\/doi.org\/10.1109\/TIP.2023.3327924","journal-title":"IEEE Trans. Image Process."},{"key":"3931_CR46","doi-asserted-by":"publisher","unstructured":"Wang, G., Yu, F., Li, J. et al.: Exploiting the textual potential from vision-language pre-training for text-based person search. arXiv. (2023) https:\/\/doi.org\/10.48550\/arXiv.2303.04497.","DOI":"10.48550\/arXiv.2303.04497"},{"key":"3931_CR47","doi-asserted-by":"crossref","unstructured":"Sennrich, R., Haddow, B., Birch, A. J., A, P. A.: Neural machine translation of rare words with subword units. In: Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics, vol.1, pp.1715\u20131725 (2016)","DOI":"10.18653\/v1\/P16-1162"},{"key":"3931_CR48","doi-asserted-by":"crossref","unstructured":"Aghajanyan, A., Gupta, S., Zettlemoyer, L. Intrinsic dimensionality explains the effectiveness of language model fine-tuning. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing, vol.1,pp. 7319\u20137328 (2021).","DOI":"10.18653\/v1\/2021.acl-long.568"},{"key":"3931_CR49","doi-asserted-by":"publisher","unstructured":"Fu, Z., Zhou, W., Xu, J. et al.: Contextual representation learning beyond masked language modeling. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics, vol.1, pp. 2701\u20132714 (2022). https:\/\/doi.org\/10.18653\/v1\/2022.acl-long.193.","DOI":"10.18653\/v1\/2022.acl-long.193"},{"issue":"6","key":"3931_CR50","doi-asserted-by":"publisher","first-page":"2872","DOI":"10.1109\/TPAMI.2021.3054775","volume":"44","author":"M Ye","year":"2022","unstructured":"Ye, M., Shen, J., Lin, G., et al.: Deep learning for person re-identification: a survey and outlook. IEEE Trans. Pattern Anal. Mach. Intell. 44(6), 2872\u20132893 (2022). https:\/\/doi.org\/10.1109\/TPAMI.2021.3054775","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3931_CR51","doi-asserted-by":"publisher","unstructured":"Kingma, D. P., Ba, J. J.: Adam: A method for stochastic optimization. In: International Conference on Learning Representations (2014). https:\/\/doi.org\/10.48550\/arXiv.1412.6980","DOI":"10.48550\/arXiv.1412.6980"},{"key":"3931_CR52","doi-asserted-by":"publisher","unstructured":"Cao, M., Bai, Y., Zeng, Z. et al. An empirical study of clip for text-based person search. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38(1), pp.465\u2013473 (2024). https:\/\/doi.org\/10.1609\/aaai.v38i1.27801.","DOI":"10.1609\/aaai.v38i1.27801"},{"key":"3931_CR53","doi-asserted-by":"publisher","unstructured":"Wang, Z., Zhu, A., Xue, J. et al.: CAIBC: Capturing all-round information beyond color for text-based person retrieval. In: Proceedings of the 30th ACM International Conference on Multimedia, pp.5314\u20135322 (2022). https:\/\/doi.org\/10.1145\/3503161.3548057.","DOI":"10.1145\/3503161.3548057"},{"key":"3931_CR54","doi-asserted-by":"publisher","unstructured":"Yang, S., Zhou, Y., Zheng, Z. et al. Towards Unified text-based person retrieval: a large-scale multi-attribute and language search benchmark. In: Proceedings of the 31st ACM International Conference on Multimedia, pp.4492\u20134501 (2023). https:\/\/doi.org\/10.1145\/3581783.3611709.","DOI":"10.1145\/3581783.3611709"},{"key":"3931_CR55","doi-asserted-by":"publisher","unstructured":"Shao, Z., Zhang, X., Ding, C. et al.: Unified pre-training with pseudo texts for text-to-image person re-identification. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp.11140\u201311150 (2023). https:\/\/doi.org\/10.1109\/ICCV51070.2023.01026.","DOI":"10.1109\/ICCV51070.2023.01026"},{"key":"3931_CR56","doi-asserted-by":"publisher","unstructured":"Chen, Z., Duan, Y., Wang, W., et al.: Vision transformer adapter for dense predictions (2022). https:\/\/doi.org\/10.48550\/arXiv.2205.08534.","DOI":"10.48550\/arXiv.2205.08534"},{"key":"3931_CR57","doi-asserted-by":"publisher","unstructured":"Dou, Z. Y., Xu, Y., Gan, Z., et al.: An empirical study of training end-to-end vision-and-language transformers. Proceedings of the 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp.18145\u201318155 (2022). https:\/\/doi.org\/10.1109\/CVPR52688.2022.01763.","DOI":"10.1109\/CVPR52688.2022.01763"},{"key":"3931_CR58","doi-asserted-by":"publisher","first-page":"570","DOI":"10.1162\/tacl_a_00385","volume":"9","author":"LA Hendricks","year":"2021","unstructured":"Hendricks, L.A., Mellor, J.F.J., Schneider, R., et al.: Decoupling the role of data, attention, and losses in multimodal transformers. Trans. Assoc. Comput. Linguistics 9, 570\u2013585 (2021). https:\/\/doi.org\/10.1162\/tacl_a_00385","journal-title":"Trans. Assoc. Comput. Linguistics"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-025-03931-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-025-03931-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-025-03931-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,15]],"date-time":"2025-09-15T09:36:06Z","timestamp":1757928966000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-025-03931-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,27]]},"references-count":58,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2025,9]]}},"alternative-id":["3931"],"URL":"https:\/\/doi.org\/10.1007\/s00371-025-03931-8","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"print"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,5,27]]},"assertion":[{"value":"21 April 2025","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 May 2025","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}