{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,17]],"date-time":"2026-04-17T23:49:09Z","timestamp":1776469749711,"version":"3.51.2"},"reference-count":42,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2023,7,6]],"date-time":"2023-07-06T00:00:00Z","timestamp":1688601600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,7,6]],"date-time":"2023-07-06T00:00:00Z","timestamp":1688601600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["Grant No. 61977045"],"award-info":[{"award-number":["Grant No. 61977045"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach Learn"],"published-print":{"date-parts":[[2024,4]]},"DOI":"10.1007\/s10994-023-06352-7","type":"journal-article","created":{"date-parts":[[2023,7,6]],"date-time":"2023-07-06T20:23:31Z","timestamp":1688675011000},"page":"1921-1939","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":23,"title":["Deep multimodal representation learning for generalizable person re-identification"],"prefix":"10.1007","volume":"113","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9141-6460","authenticated-orcid":false,"given":"Suncheng","family":"Xiang","sequence":"first","affiliation":[]},{"given":"Hao","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Wei","family":"Ran","sequence":"additional","affiliation":[]},{"given":"Zefang","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Ting","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Dahong","family":"Qian","sequence":"additional","affiliation":[]},{"given":"Yuzhuo","family":"Fu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,7,6]]},"reference":[{"key":"6352_CR1","unstructured":"Baevski, A., Hsu, W.N., Xu, Q., et\u00a0al. (2022). Data2vec: A general framework for self-supervised learning in speech, vision and language. arXiv:2202.03555"},{"key":"6352_CR2","doi-asserted-by":"crossref","unstructured":"Deng, W., Zheng, L., Ye, Q., et\u00a0al. (2018). Image-image domain adaptation with preserved self-similarity and domain-dissimilarity for person re-identification. In IEEE conference on computer vision and pattern recognition (pp. 994\u20131003).","DOI":"10.1109\/CVPR.2018.00110"},{"key":"6352_CR3","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., et\u00a0al. (2020). An image is worth 16x16 words: Transformers for image recognition at scale. arXiv:2010.11929"},{"key":"6352_CR4","unstructured":"Ester, M., Kriegel, H.P., Sander, J., et\u00a0al. (1996). A density-based algorithm for discovering clusters in large spatial databases with noise. In ACM SIGKDD international conference on knowledge discovery and data mining (pp. 226\u2013231)."},{"key":"6352_CR5","unstructured":"Han, K., Wang, Y., Chen, H., et\u00a0al. (2022). A survey on vision transformer. IEEE Transactions on Pattern Analysis and Machine Intelligence."},{"key":"6352_CR6","doi-asserted-by":"crossref","unstructured":"He, S., Luo, H., Wang, P., et\u00a0al. (2021). Transreid: Transformer-based object re-identification. In IEEE international conference on computer vision (pp. 15,013\u201315,022).","DOI":"10.1109\/ICCV48922.2021.01474"},{"key":"6352_CR7","unstructured":"Hermans, A., Beyer, L., & Leibe, B. (2017). In defense of the triplet loss for person re-identification. arXiv:1703.07737."},{"key":"6352_CR8","doi-asserted-by":"crossref","unstructured":"Jia, M., Cheng, X., Lu, S., et\u00a0al. (2022). Learning disentangled representation implicitly via transformer for occluded person re-identification. IEEE Transactions on Multimedia.","DOI":"10.1109\/TMM.2022.3141267"},{"key":"6352_CR9","doi-asserted-by":"crossref","unstructured":"Jin, X., Lan, C., Zeng, W., et\u00a0al. (2020). Style normalization and restitution for generalizable person re-identification. In IEEE conference on computer vision and pattern recognition (pp. 3143\u20133152).","DOI":"10.1109\/CVPR42600.2020.00321"},{"key":"6352_CR10","unstructured":"Kim, W., Son, B., & Kim, I. (2021). Vilt: Vision-and-language transformer without convolution or region supervision. In International conference on machine learning, PMLR (pp. 5583\u20135594)."},{"key":"6352_CR11","doi-asserted-by":"crossref","unstructured":"Li, W., Zhao, R., Xiao, T., et\u00a0al. (2014). Deepreid: Deep filter pairing neural network for person re-identification. In IEEE conference on computer vision and pattern recognition (pp. 152\u2013159).","DOI":"10.1109\/CVPR.2014.27"},{"key":"6352_CR12","doi-asserted-by":"crossref","unstructured":"Liao, S., & Shao, L. (2020). Interpretable and generalizable person re-identification with query-adaptive convolution and temporal lifting. In European conference on computer vision (pp. 456\u2013474). Springer.","DOI":"10.1007\/978-3-030-58621-8_27"},{"key":"6352_CR13","first-page":"1992","volume":"34","author":"S Liao","year":"2021","unstructured":"Liao, S., & Shao, L. (2021). Transmatcher: Deep image matching through transformers for generalizable person re-identification. Advances in Neural Information Processing Systems, 34, 1992\u20132003.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"6352_CR14","doi-asserted-by":"crossref","unstructured":"Liao, S., & Shao, L. (2022). Graph sampling based deep metric learning for generalizable person re-identification. In IEEE conference on computer vision and pattern recognition (pp. 7359\u20137368).","DOI":"10.1109\/CVPR52688.2022.00721"},{"issue":"10","key":"6352_CR15","doi-asserted-by":"publisher","first-page":"2597","DOI":"10.1109\/TMM.2019.2958756","volume":"22","author":"H Luo","year":"2019","unstructured":"Luo, H., Jiang, W., Gu, Y., et al. (2019). A strong baseline and batch normalization neck for deep person re-identification. IEEE Transactions on Multimedia, 22(10), 2597\u20132609.","journal-title":"IEEE Transactions on Multimedia"},{"key":"6352_CR16","doi-asserted-by":"crossref","unstructured":"Muhammad, M. B., & Yeasin, M. (2020). Eigen-cam: Class activation map using principal components. In International joint conference on neural networks (pp. 1\u20137). IEEE.","DOI":"10.1109\/IJCNN48605.2020.9206626"},{"key":"6352_CR17","doi-asserted-by":"crossref","unstructured":"Pei, J., Cheng, T., Tang, H., et\u00a0al. (2022). Transformer-based efficient salient instance segmentation networks with orientative query. IEEE Transactions on Multimedia.","DOI":"10.1109\/TMM.2022.3141891"},{"issue":"2","key":"6352_CR18","doi-asserted-by":"publisher","first-page":"371","DOI":"10.1109\/TPAMI.2019.2928294","volume":"42","author":"X Qian","year":"2019","unstructured":"Qian, X., Fu, Y., Xiang, T., et al. (2019). Leader-based multi-scale attention deep architecture for person re-identification. IEEE Transactions on Pattern Analysis and Machine Intelligence, 42(2), 371\u2013385.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"6352_CR19","doi-asserted-by":"crossref","unstructured":"Ristani, E., Solera, F., Zou, R., et\u00a0al. (2016) Performance measures and a data set for multi-target, multi-camera tracking. In European conference on computer vision (pp. 17\u201335). Springer.","DOI":"10.1007\/978-3-319-48881-3_2"},{"key":"6352_CR20","doi-asserted-by":"crossref","unstructured":"Sun, X., & Zheng, L. (2019). Dissecting person re-identification from the viewpoint of viewpoint. In IEEE conference on computer vision and pattern recognition (pp. 608\u2013617).","DOI":"10.1109\/CVPR.2019.00070"},{"key":"6352_CR21","doi-asserted-by":"crossref","unstructured":"Sun, Y., Zheng, L., Yang, Y., et\u00a0al. (2018). Beyond part models: Person retrieval with refined part pooling (and a strong convolutional baseline). In European conference on computer vision (pp. 480\u2013496).","DOI":"10.1007\/978-3-030-01225-0_30"},{"key":"6352_CR22","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., et\u00a0al. (2017). Attention is all you need. Advances in Neural Information Processing Systems, 30."},{"key":"6352_CR23","doi-asserted-by":"crossref","unstructured":"Wang, G., Yuan, Y., Chen, X., et\u00a0al. (2018). Learning discriminative features with multiple granularities for person re-identification. In ACM international conference on multimedia (pp. 274\u2013282).","DOI":"10.1145\/3240508.3240552"},{"key":"6352_CR24","doi-asserted-by":"crossref","unstructured":"Wang, P., Ding, C., Tan, W., et\u00a0al. (2022a). Uncertainty-aware clustering for unsupervised domain adaptive object re-identification. IEEE Transactions on Multimedia.","DOI":"10.1109\/TMM.2022.3149629"},{"key":"6352_CR25","doi-asserted-by":"crossref","unstructured":"Wang, X., Zhu, L., & Yang, Y. (2021). T2vlad: Global-local sequence alignment for text-video retrieval. In IEEE conference on computer vision and pattern recognition (pp. 5079\u20135088).","DOI":"10.1109\/CVPR46437.2021.00504"},{"key":"6352_CR26","doi-asserted-by":"crossref","unstructured":"Wang, X., Zhu, L., Zheng, Z., et\u00a0al. (2022b). Align and tell: Boosting text-video retrieval with local alignment and fine-grained supervision. IEEE Transactions on Multimedia.","DOI":"10.1109\/TMM.2022.3204444"},{"key":"6352_CR27","doi-asserted-by":"crossref","unstructured":"Wang, Y., Liao, S., & Shao, L. (2020). Surpassing real-world source training data: Random 3d characters for generalizable person re-identification. In ACM international conference on multimedia (pp. 3422\u20133430).","DOI":"10.1145\/3394171.3413815"},{"key":"6352_CR28","doi-asserted-by":"crossref","unstructured":"Wei, L., Zhang, S., Gao, W., et\u00a0al. (2018). Person transfer gan to bridge domain gap for person re-identification. In IEEE conference on computer vision and pattern recognition (pp. 79\u201388).","DOI":"10.1109\/CVPR.2018.00016"},{"key":"6352_CR33","doi-asserted-by":"crossref","unstructured":"Xiang, S., Fu, Y., Guan, M., et\u00a0al. (2022). Learning from self-discrepancy via multiple co-teaching for cross-domain person re-identification. Machine Learning, 1\u201318.","DOI":"10.1007\/s10994-022-06184-x"},{"key":"6352_CR29","doi-asserted-by":"crossref","unstructured":"Xiang, S., Fu, Y., You, G., et\u00a0al. (2020). Unsupervised domain adaptation through synthesis for person re-identification. In IEEE international conference on multimedia and expo (pp. 1\u20136). IEEE.","DOI":"10.1109\/ICME46284.2020.9102822"},{"key":"6352_CR30","doi-asserted-by":"crossref","unstructured":"Xiang, S., Fu, Y., You, G., et al. (2021a). Taking a closer look at synthesis: Fine-grained attribute analysis for person re-identification. In IEEE International Conference on Acoustics, Speech and Signal Processing (pp. 3765\u20133769). IEEE.","DOI":"10.1109\/ICASSP39728.2021.9413757"},{"key":"6352_CR34","unstructured":"Xiang, S., Gao, J., Guan, M., et al. (2023b). Learning robust visual-semantic embedding for generalizable person re-identification. arXiv:2304.09498"},{"key":"6352_CR31","unstructured":"Xiang, S., Gao, J., Zhang, Z., et\u00a0al. (2021b). Rethinking person re-identification via semantic-based pretraining. arXiv:2110.05074"},{"key":"6352_CR32","doi-asserted-by":"crossref","unstructured":"Xiang, S., Qian, D., Guan, M., et al. (2023a). Less is more: Learning from synthetic data with fine-grained attributes for person re-identification. ACM Transactions on Multimedia Computing, Communications, and Applications, 19(5s), 1\u201320.","DOI":"10.1145\/3588441"},{"issue":"12","key":"6352_CR35","doi-asserted-by":"publisher","first-page":"1551","DOI":"10.1631\/FITEE.2100463","volume":"22","author":"Y Yang","year":"2021","unstructured":"Yang, Y., Zhuang, Y., & Pan, Y. (2021). Multiple knowledge representation for big data artificial intelligence: Framework, applications, and case studies. Frontiers of Information Technology and Electronic Engineering, 22(12), 1551\u20131558.","journal-title":"Frontiers of Information Technology and Electronic Engineering"},{"key":"6352_CR36","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Chen, W., Chen, T., et\u00a0al. (2020). Calibrated domain-invariant learning for highly generalizable large scale re-identification. In IEEE\/CVF winter conference on applications of computer vision (pp. 3589\u20133598).","DOI":"10.1109\/WACV45572.2020.9093521"},{"key":"6352_CR37","doi-asserted-by":"crossref","unstructured":"Zhang, T., Xie, L., Wei, L., et\u00a0al. (2021). Unrealperson: An adaptive pipeline towards costless person re-identification. In IEEE conference on computer vision and pattern recognition (pp. 11,506\u201311,515).","DOI":"10.1109\/CVPR46437.2021.01134"},{"key":"6352_CR38","doi-asserted-by":"crossref","unstructured":"Zheng, L., Shen, L., Tian, L., et\u00a0al. (2015). Scalable person re-identification: A benchmark. In IEEE International Conference on Computer Vision (pp. 1116\u20131124).","DOI":"10.1109\/ICCV.2015.133"},{"issue":"1","key":"6352_CR39","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3159171","volume":"14","author":"Z Zheng","year":"2017","unstructured":"Zheng, Z., Zheng, L., & Yang, Y. (2017). A discriminatively learned cnn embedding for person reidentification. ACM Transactions on Multimedia Computing, Communications, and Applications, 14(1), 1\u201320.","journal-title":"ACM Transactions on Multimedia Computing, Communications, and Applications"},{"key":"6352_CR40","doi-asserted-by":"publisher","first-page":"1264","DOI":"10.1109\/TMM.2020.2995278","volume":"23","author":"H Zhong","year":"2020","unstructured":"Zhong, H., Chen, J., Shen, C., et al. (2020). Self-adaptive neural module transformer for visual question answering. IEEE Transactions on Multimedia, 23, 1264\u20131273.","journal-title":"IEEE Transactions on Multimedia"},{"key":"6352_CR41","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, Y., Cavallaro, A., et\u00a0al. (2019). Omni-scale feature learning for person re-identification. In IEEE international conference on computer vision (pp. 3702\u20133712).","DOI":"10.1109\/ICCV.2019.00380"},{"key":"6352_CR42","doi-asserted-by":"crossref","unstructured":"Zhuang, Z., Wei, L., Xie, L., et\u00a0al. (2020). Rethinking the distribution gap of person re-identification with camera-based batch normalization. In European conference on computer vision (pp. 140\u2013157). Springer.","DOI":"10.1007\/978-3-030-58610-2_9"}],"container-title":["Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-023-06352-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10994-023-06352-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-023-06352-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,7,6]],"date-time":"2024-07-06T00:27:20Z","timestamp":1720225640000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10994-023-06352-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,7,6]]},"references-count":42,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2024,4]]}},"alternative-id":["6352"],"URL":"https:\/\/doi.org\/10.1007\/s10994-023-06352-7","relation":{},"ISSN":["0885-6125","1573-0565"],"issn-type":[{"value":"0885-6125","type":"print"},{"value":"1573-0565","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,7,6]]},"assertion":[{"value":"29 December 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 April 2023","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 May 2023","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 July 2023","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"All procedures performed in studies involving human participants were in accordance with the ethical standards of the institutional and\/or national research committee.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval"}},{"value":"All human participants consented for participating in this study.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent to participate"}},{"value":"All contents in this paper are consented for publication.","order":5,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}