{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,10]],"date-time":"2025-06-10T14:10:08Z","timestamp":1749564608733,"version":"3.41.0"},"reference-count":44,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2025,3,10]],"date-time":"2025-03-10T00:00:00Z","timestamp":1741564800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,3,10]],"date-time":"2025-03-10T00:00:00Z","timestamp":1741564800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Multimed Info Retr"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s13735-025-00356-w","type":"journal-article","created":{"date-parts":[[2025,3,10]],"date-time":"2025-03-10T10:59:24Z","timestamp":1741604364000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Cross-modal alignment with synthetic caption for text-based person search"],"prefix":"10.1007","volume":"14","author":[{"given":"Weichen","family":"Zhao","sequence":"first","affiliation":[]},{"given":"Yuxing","family":"Lu","sequence":"additional","affiliation":[]},{"given":"Zhiyuan","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Yuan","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Ge","family":"Jiao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,3,10]]},"reference":[{"key":"356_CR1","unstructured":"Han X, He S, Zhang L, Xiang T (2021) Text-based person search with limited data"},{"key":"356_CR2","doi-asserted-by":"crossref","unstructured":"Wang Z, Fang Z, Wang J, Yang Y (2020) Vitaa: Visual-textual attributes alignment in person search by natural language. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XII 16, pp 402\u2013420. Springer","DOI":"10.1007\/978-3-030-58610-2_24"},{"key":"356_CR3","doi-asserted-by":"publisher","first-page":"5542","DOI":"10.1109\/TIP.2020.2984883","volume":"29","author":"K Niu","year":"2020","unstructured":"Niu K, Huang Y, Ouyang W, Wang L (2020) Improving description-based person re-identification by multi-granularity image-text alignments. IEEE Trans Image Process 29:5542\u20135556","journal-title":"IEEE Trans Image Process"},{"key":"356_CR4","doi-asserted-by":"publisher","first-page":"7699","DOI":"10.1109\/TMM.2022.3225754","volume":"25","author":"Z Ji","year":"2022","unstructured":"Ji Z, Hu J, Liu D, Wu LY, Zhao Y (2022) Asymmetric cross-scale alignment for text-based person search. IEEE Trans Multimed 25:7699\u20137709","journal-title":"IEEE Trans Multimed"},{"key":"356_CR5","unstructured":"Ding Z, Ding C, Shao Z, Tao D (2021) Semantically self-aligned network for text-to-image part-aware person re-identification. arXiv preprint arXiv:2107.12666"},{"key":"356_CR6","doi-asserted-by":"crossref","unstructured":"Ma Y, Sun X, Ji J, Jiang G, Zhuang W, Ji R (2023) Beat: Bi-directional one-to-many embedding alignment for text-based person retrieval. In: Proceedings of the 31st ACM International Conference on Multimedia, pp 4157\u20134168","DOI":"10.1145\/3581783.3611768"},{"key":"356_CR7","doi-asserted-by":"crossref","unstructured":"Yang K, Deng J, An X, Li J, Feng Z, Guo J, Yang J, Liu T (2023) Alip: Adaptive language-image pre-training with synthetic caption. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 2922\u20132931","DOI":"10.1109\/ICCV51070.2023.00273"},{"key":"356_CR8","unstructured":"Li J, Li D, Xiong C, Hoi S (2022) Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp 12888\u201312900. PMLR"},{"key":"356_CR9","doi-asserted-by":"crossref","unstructured":"Wang Z, Zhu A, Xue J, Wan X, Liu C, Wang T, Li Y (2022) Caibc: Capturing all-round information beyond color for text-based person retrieval. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 5314\u20135322","DOI":"10.1145\/3503161.3548057"},{"key":"356_CR10","unstructured":"Radford A, Kim JW, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, et al (2021) Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp 8748\u20138763. PMLR"},{"key":"356_CR11","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li J, Selvaraju R, Gotmare A, Joty S, Xiong C, Hoi SCH (2021) Align before fuse: vision and language representation learning with momentum distillation. Adv Neural Inform Process Syst 34:9694\u20139705","journal-title":"Adv Neural Inform Process Syst"},{"key":"356_CR12","unstructured":"Li J, Li D, Savarese S, Hoi S (2023) Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning, pp 19730\u201319742. PMLR"},{"key":"356_CR13","doi-asserted-by":"crossref","unstructured":"Li S, Xiao T, Li H, Zhou B, Yue D, Wang X (2017) Person search with natural language description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 1970\u20131979","DOI":"10.1109\/CVPR.2017.551"},{"key":"356_CR14","doi-asserted-by":"crossref","unstructured":"Jing Y, Si C, Wang J, Wang W, Wang L, Tan T (2020) Pose-guided multi-granularity attention network for text-based person search. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, pp 11189\u201311196","DOI":"10.1609\/aaai.v34i07.6777"},{"key":"356_CR15","doi-asserted-by":"crossref","unstructured":"Bai Y, Cao M, Gao D, Cao Z, Chen C, Fan Z, Nie L, Zhang M (2023) Rasa: Relation and sensitivity aware representation learning for text-based person search. arXiv preprint arXiv:2305.13653","DOI":"10.24963\/ijcai.2023\/62"},{"key":"356_CR16","doi-asserted-by":"crossref","unstructured":"Yang S, Zhou Y, Zheng Z, Wang Y, Zhu L, Wu Y (2023) Towards unified text-based person retrieval: A large-scale multi-attribute and language search benchmark. In: Proceedings of the 31st ACM International Conference on Multimedia, pp 4492\u20134501","DOI":"10.1145\/3581783.3611709"},{"key":"356_CR17","doi-asserted-by":"crossref","unstructured":"Bai Y, Wang J, Cao M, Chen C, Cao Z, Nie L, Zhang M (2023) Text-based person search without parallel image-text data. arXiv preprint arXiv:2305.12964","DOI":"10.1145\/3581783.3612285"},{"key":"356_CR18","doi-asserted-by":"crossref","unstructured":"He K, Fan H, Wu Y, Xie S, Girshick R (2020) Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 9729\u20139738","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"356_CR19","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S. et al (2020) An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929"},{"key":"356_CR20","unstructured":"Devlin J, Chang M-W, Lee K, Toutanova K (2018) Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805"},{"key":"356_CR21","doi-asserted-by":"crossref","unstructured":"Ronneberger O, Fischer P, Brox T (2015) U-net: Convolutional networks for biomedical image segmentation. In: Medical Image Computing and Computer-Assisted Intervention\u2013MICCAI 2015: 18th International Conference, Munich, Germany, October 5-9, 2015, Proceedings, Part III 18, pp 234\u2013241. Springer","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"356_CR22","doi-asserted-by":"publisher","first-page":"6032","DOI":"10.1109\/TIP.2023.3327924","volume":"32","author":"S Yan","year":"2023","unstructured":"Yan S, Dong N, Zhang L, Tang J (2023) Clip-driven fine-grained text-image person re-identification. IEEE Trans Image Process 32:6032\u20136046","journal-title":"IEEE Trans Image Process"},{"key":"356_CR23","unstructured":"Oord Avd, Li Y, Vinyals O (2018) Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748"},{"key":"356_CR24","doi-asserted-by":"crossref","unstructured":"Zhu A, Wang Z, Li Y, Wan X, Jin J, Wang T, Hu F, Hua G (2021) Dssl: Deep surroundings-person separation learning for text-based person retrieval. In: Proceedings of the 29th ACM International Conference on Multimedia, pp 209\u2013217","DOI":"10.1145\/3474085.3475369"},{"key":"356_CR25","doi-asserted-by":"crossref","unstructured":"Wu Y, Yan Z, Han X, Li G, Zou C, Cui S (2021) Lapscore: language-guided person search via color reasoning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 1624\u20131633","DOI":"10.1109\/ICCV48922.2021.00165"},{"key":"356_CR26","doi-asserted-by":"crossref","unstructured":"Li S, Cao M, Zhang M (2022) Learning semantic-aligned feature representation for text-based person search. In: ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp 2724\u20132728","DOI":"10.1109\/ICASSP43922.2022.9746846"},{"key":"356_CR27","doi-asserted-by":"crossref","unstructured":"Wang Z, Zhu A, Xue J, Wan X, Liu C, Wang T, Li Y (2022) Look before you leap: Improving text-based person retrieval by learning a consistent cross-modal common manifold. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 1984\u20131992","DOI":"10.1145\/3503161.3548166"},{"key":"356_CR28","doi-asserted-by":"crossref","unstructured":"Shao Z, Zhang X, Fang M, Lin Z, Wang J, Ding C (2022) Learning granularity-unified representations for text-to-image person re-identification. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 5566\u20135574","DOI":"10.1145\/3503161.3548028"},{"key":"356_CR29","doi-asserted-by":"crossref","unstructured":"Farooq A, Awais M, Kittler J, Khalid SS (2022) Axm-net: Implicit cross-modal feature alignment for person re-identification. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36, pp 4477\u20134485","DOI":"10.1609\/aaai.v36i4.20370"},{"key":"356_CR30","doi-asserted-by":"crossref","unstructured":"Yan S, Dong N, Liu J, Zhang L, Tang J (2023) Learning comprehensive representations with richer self for text-to-image person re-identification. In: Proceedings of the 31st ACM International Conference on Multimedia, pp 6202\u20136211","DOI":"10.1145\/3581783.3611832"},{"key":"356_CR31","doi-asserted-by":"crossref","unstructured":"Shu X, Wen W, Wu H, Chen K, Song Y, Qiao R, Ren B, Wang X (2022) See finer, see more: Implicit modality alignment for text-based person retrieval. In: European Conference on Computer Vision, pp 624\u2013641. Springer","DOI":"10.1007\/978-3-031-25072-9_42"},{"key":"356_CR32","doi-asserted-by":"crossref","unstructured":"Zang X, Gao W, Li G, Fang H, Ban C, He Z, Sun H (2023) A baseline investigation: Transformer-based cross-view baseline for text-based person search. In: Proceedings of the 31st ACM International Conference on Multimedia, pp 7737\u20137746","DOI":"10.1145\/3581783.3611916"},{"key":"356_CR33","doi-asserted-by":"crossref","unstructured":"Jiang D, Ye M (2023) Cross-modal implicit relation reasoning and aligning for text-to-image person retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 2787\u20132797","DOI":"10.1109\/CVPR52729.2023.00273"},{"key":"356_CR34","unstructured":"Cao M, Bai Y, Zeng Z, Ye M, Zhang M (2023) An empirical study of clip for text-based person search. arXiv preprint arXiv:2308.10045"},{"key":"356_CR35","doi-asserted-by":"crossref","unstructured":"Fujii T, Tarashima S (2023) Bilma: Bidirectional local-matching for text-based person re-identification. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 2786\u20132790","DOI":"10.1109\/ICCVW60793.2023.00295"},{"key":"356_CR36","doi-asserted-by":"crossref","unstructured":"Qin Y, Chen Y, Peng D, Peng X, Zhou JT, Hu P (2024) Noisy-correspondence learning for text-to-image person re-identification. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 27197\u201327206","DOI":"10.1109\/CVPR52733.2024.02568"},{"key":"356_CR37","doi-asserted-by":"crossref","unstructured":"Yan S, Liu J, Dong N, Zhang L, Tang J (2024) Prototypical prompting for text-to-image person re-identification. In: Proceedings of the 32nd ACM International Conference on Multimedia, pp 2331\u20132340","DOI":"10.1145\/3664647.3681165"},{"key":"356_CR38","doi-asserted-by":"crossref","unstructured":"Li S, He C, Xu X, Shen F, Yang Y, Shen HT (2024) Adaptive uncertainty-based learning for text-based person retrieval. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38, pp 3172\u20133180","DOI":"10.1609\/aaai.v38i4.28101"},{"issue":"2","key":"356_CR39","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3383184","volume":"16","author":"Z Zheng","year":"2020","unstructured":"Zheng Z, Zheng L, Garrett M, Yang Y, Xu M, Shen Y-D (2020) Dual-path convolutional image-text embeddings with instance loss. ACM Trans Multimed Comput Commun Appl (TOMM) 16(2):1\u201323","journal-title":"ACM Trans Multimed Comput Commun Appl (TOMM)"},{"key":"356_CR40","doi-asserted-by":"crossref","unstructured":"Zhang Y, Lu H (2018) Deep cross-modal projection learning for image-text matching. In: Proceedings of the European Conference on Computer Vision (ECCV), pp 686\u2013701","DOI":"10.1007\/978-3-030-01246-5_42"},{"key":"356_CR41","doi-asserted-by":"publisher","first-page":"171","DOI":"10.1016\/j.neucom.2022.04.081","volume":"494","author":"Y Chen","year":"2022","unstructured":"Chen Y, Zhang G, Lu Y, Wang Z, Zheng Y (2022) Tipcb: a simple but effective part-based convolutional baseline for text-based person search. Neurocomputing 494:171\u2013181","journal-title":"Neurocomputing"},{"key":"356_CR42","doi-asserted-by":"crossref","unstructured":"Suo W, Sun M, Niu K, Gao Y, Wang P, Zhang Y, Wu Q (2022) A simple and robust correlation filtering method for text-based person search. In: European Conference on Computer Vision, pp 726\u2013742. Springer","DOI":"10.1007\/978-3-031-19833-5_42"},{"key":"356_CR43","doi-asserted-by":"crossref","unstructured":"Wang Z, Xue J, Zhu A, Li Y, Zhang M, Zhong C (2021) Amen: Adversarial multi-space embedding network for text-based person re-identification. In: Pattern Recognition and Computer Vision: 4th Chinese Conference, PRCV 2021, Beijing, China, October 29\u2013November 1, 2021, Proceedings, Part II 4, pp 462\u2013473. Springer","DOI":"10.1007\/978-3-030-88007-1_38"},{"key":"356_CR44","doi-asserted-by":"crossref","unstructured":"McInnes L, Healy J, Melville J (2018) Umap: Uniform manifold approximation and projection for dimension reduction. arXiv preprint arXiv:1802.03426","DOI":"10.21105\/joss.00861"}],"container-title":["International Journal of Multimedia Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-025-00356-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13735-025-00356-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-025-00356-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,10]],"date-time":"2025-06-10T13:40:23Z","timestamp":1749562823000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13735-025-00356-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,10]]},"references-count":44,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["356"],"URL":"https:\/\/doi.org\/10.1007\/s13735-025-00356-w","relation":{},"ISSN":["2192-6611","2192-662X"],"issn-type":[{"type":"print","value":"2192-6611"},{"type":"electronic","value":"2192-662X"}],"subject":[],"published":{"date-parts":[[2025,3,10]]},"assertion":[{"value":"28 August 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 November 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 February 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 March 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"All authors certify that they have no affiliations with or involvement in any organization or entity with any financial or non-financial interest in the subject matter or materials discussed in this manuscript.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interests"}}],"article-number":"11"}}