{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,26]],"date-time":"2025-06-26T12:10:02Z","timestamp":1750939802616,"version":"3.41.0"},"publisher-location":"Singapore","reference-count":42,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819615278"},{"type":"electronic","value":"9789819615285"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-1528-5_17","type":"book-chapter","created":{"date-parts":[[2025,2,14]],"date-time":"2025-02-14T17:23:41Z","timestamp":1739553821000},"page":"254-269","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Cross-Modal Mask and\u00a0Detail Alignment for\u00a0Text-Based Person Retrieval"],"prefix":"10.1007","author":[{"given":"Ao","family":"Guo","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xuan","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xianggan","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bingmeng","family":"Hu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jie","family":"Yuan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chiawei","family":"Chu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,2,15]]},"reference":[{"key":"17_CR1","doi-asserted-by":"crossref","unstructured":"Aggarwal, S., Radhakrishnan, V.B., Chakraborty, A.: Text-based person search via attribute-aided matching. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 2617\u20132625 (2020)","DOI":"10.1109\/WACV45572.2020.9093640"},{"key":"17_CR2","unstructured":"Bao, H., Dong, L., Piao, S., Wei, F.: Beit: bert pre-training of image transformers. arXiv preprint arXiv:2106.08254 (2021)"},{"key":"17_CR3","unstructured":"Cao, S., Xu, P., Clifton, D.A.: How to understand masked autoencoders. arXiv preprint arXiv:2202.03670 (2022)"},{"key":"17_CR4","doi-asserted-by":"crossref","unstructured":"Chen, J., et al.: Eve: Efficient vision-language pre-training with masked prediction and modality-aware moe. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a038, pp. 1110\u20131119 (2024)","DOI":"10.1609\/aaai.v38i2.27872"},{"key":"17_CR5","doi-asserted-by":"crossref","unstructured":"Chen, T., Xu, C., Luo, J.: Improving text-based person search by spatial matching and adaptive threshold. In: 2018 IEEE Winter Conference on Applications of Computer Vision (WACV), pp. 1879\u20131887. IEEE (2018)","DOI":"10.1109\/WACV.2018.00208"},{"key":"17_CR6","doi-asserted-by":"publisher","first-page":"4057","DOI":"10.1109\/TIP.2021.3068825","volume":"30","author":"Y Chen","year":"2021","unstructured":"Chen, Y., Huang, R., Chang, H., Tan, C., Xue, T., Ma, B.: Cross-modal knowledge adaptation for language-based person search. IEEE Trans. Image Process. 30, 4057\u20134069 (2021)","journal-title":"IEEE Trans. Image Process."},{"key":"17_CR7","doi-asserted-by":"publisher","first-page":"171","DOI":"10.1016\/j.neucom.2022.04.081","volume":"494","author":"Y Chen","year":"2022","unstructured":"Chen, Y., Zhang, G., Lu, Y., Wang, Z., Zheng, Y.: Tipcb: a simple but effective part-based convolutional baseline for text-based person search. Neurocomputing 494, 171\u2013181 (2022)","journal-title":"Neurocomputing"},{"key":"17_CR8","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"17_CR9","unstructured":"Diederik, P.K.: Adam: a method for stochastic optimization (2014)"},{"key":"17_CR10","unstructured":"Ding, Z., Ding, C., Shao, Z., Tao, D.: Semantically self-aligned network for text-to-image part-aware person re-identification. arXiv preprint arXiv:2107.12666 (2021)"},{"key":"17_CR11","doi-asserted-by":"crossref","unstructured":"Farooq, A., Awais, M., Kittler, J., Khalid, S.S.: Axm-net: implicit cross-modal feature alignment for person re-identification. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a036, pp. 4477\u20134485 (2022)","DOI":"10.1609\/aaai.v36i4.20370"},{"key":"17_CR12","unstructured":"Gray, D., Brennan, S., Tao, H.: Evaluating appearance models for recognition, reacquisition, and tracking. In: Proceedings of IEEE International Workshop on Performance Evaluation for Tracking and Surveillance (PETS), vol.\u00a03, pp.\u00a01\u20137 (2007)"},{"key":"17_CR13","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"17_CR14","doi-asserted-by":"crossref","unstructured":"Jiang, D., Ye, M.: Cross-modal implicit relation reasoning and aligning for text-to-image person retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2787\u20132797 (2023)","DOI":"10.1109\/CVPR52729.2023.00273"},{"key":"17_CR15","doi-asserted-by":"crossref","unstructured":"Jing, Y., Si, C., Wang, J., Wang, W., Wang, L., Tan, T.: Pose-guided multi-granularity attention network for text-based person search. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 11189\u201311196 (2020)","DOI":"10.1609\/aaai.v34i07.6777"},{"key":"17_CR16","doi-asserted-by":"crossref","unstructured":"Li, S., Cao, M., Zhang, M.: Learning semantic-aligned feature representation for text-based person search. In: ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 2724\u20132728. IEEE (2022)","DOI":"10.1109\/ICASSP43922.2022.9746846"},{"key":"17_CR17","doi-asserted-by":"crossref","unstructured":"Li, S., Xiao, T., Li, H., Yang, W., Wang, X.: Identity-aware textual-visual matching with latent co-attention. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1890\u20131899 (2017)","DOI":"10.1109\/ICCV.2017.209"},{"key":"17_CR18","doi-asserted-by":"crossref","unstructured":"Li, S., Xiao, T., Li, H., Zhou, B., Yue, D., Wang, X.: Person search with natural language description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1970\u20131979 (2017)","DOI":"10.1109\/CVPR.2017.551"},{"key":"17_CR19","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"31","DOI":"10.1007\/978-3-642-37331-2_3","volume-title":"Computer Vision \u2013 ACCV 2012","author":"W Li","year":"2013","unstructured":"Li, W., Zhao, R., Wang, X.: Human reidentification with transferred metric learning. In: Lee, K.M., Matsushita, Y., Rehg, J.M., Hu, Z. (eds.) ACCV 2012. LNCS, vol. 7724, pp. 31\u201344. Springer, Heidelberg (2013). https:\/\/doi.org\/10.1007\/978-3-642-37331-2_3"},{"key":"17_CR20","doi-asserted-by":"publisher","unstructured":"Li, W., Zhao, R., Xiao, T., Wang, X.: Deepreid: deep filter pairing neural network for person re-identification. In: 2014 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2014, Columbus, OH, USA, 23\u201328 June 2014, pp. 152\u2013159. IEEE Computer Society (2014). https:\/\/doi.org\/10.1109\/CVPR.2014.27","DOI":"10.1109\/CVPR.2014.27"},{"key":"17_CR21","doi-asserted-by":"crossref","unstructured":"Liu, J., Zha, Z.J., Hong, R., Wang, M., Zhang, Y.: Deep adversarial graph attention convolution network for text-based person search. In: Proceedings of the 27th ACM International Conference on Multimedia, pp. 665\u2013673 (2019)","DOI":"10.1145\/3343031.3350991"},{"key":"17_CR22","unstructured":"Ma, Z., Xu, F., Liu, J., Yang, M., Guo, Q.: Sycoca: symmetrizing contrastive captioners with attentive masking for multimodal alignment. arXiv preprint arXiv:2401.02137 (2024)"},{"issue":"8","key":"17_CR23","first-page":"1735","volume":"9","author":"LST Memory","year":"2010","unstructured":"Memory, L.S.T.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (2010)","journal-title":"Neural Comput."},{"key":"17_CR24","doi-asserted-by":"crossref","unstructured":"Sarafianos, N., Xu, X., Kakadiaris, I.A.: Adversarial representation learning for text-to-image matching. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5814\u20135824 (2019)","DOI":"10.1109\/ICCV.2019.00591"},{"key":"17_CR25","doi-asserted-by":"crossref","unstructured":"Shao, Z., Zhang, X., Fang, M., Lin, Z., Wang, J., Ding, C.: Learning granularity-unified representations for text-to-image person re-identification. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 5566\u20135574 (2022)","DOI":"10.1145\/3503161.3548028"},{"key":"17_CR26","doi-asserted-by":"publisher","unstructured":"Shu, X., et al.: See finer, see more: implicit modality alignment for text-based person retrieval. In: European Conference on Computer Vision, pp. 624\u2013641. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-25072-9_42","DOI":"10.1007\/978-3-031-25072-9_42"},{"key":"17_CR27","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)"},{"key":"17_CR28","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"402","DOI":"10.1007\/978-3-030-58610-2_24","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Z Wang","year":"2020","unstructured":"Wang, Z., Fang, Z., Wang, J., Yang, Y.: ViTAA: visual-textual attributes alignment in person search by natural language. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12357, pp. 402\u2013420. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58610-2_24"},{"key":"17_CR29","doi-asserted-by":"crossref","unstructured":"Wang, Z., et al.: Caibc: capturing all-round information beyond color for text-based person retrieval. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 5314\u20135322 (2022)","DOI":"10.1145\/3503161.3548057"},{"key":"17_CR30","doi-asserted-by":"crossref","unstructured":"Wang, Z., et al.: Look before you leap: improving text-based person retrieval by learning a consistent cross-modal common manifold. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 1984\u20131992 (2022)","DOI":"10.1145\/3503161.3548166"},{"key":"17_CR31","doi-asserted-by":"crossref","unstructured":"Wei, L., Zhang, S., Gao, W., Tian, Q.: Person transfer gan to bridge domain gap for person re-identification. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 79\u201388 (2018)","DOI":"10.1109\/CVPR.2018.00016"},{"key":"17_CR32","doi-asserted-by":"crossref","unstructured":"Wu, Y., Yan, Z., Han, X., Li, G., Zou, C., Cui, S.: Lapscore: language-guided person search via color reasoning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1624\u20131633 (2021)","DOI":"10.1109\/ICCV48922.2021.00165"},{"key":"17_CR33","unstructured":"Xiao, T., Li, S., Wang, B., Lin, L., Wang, X.: End-to-end deep learning for person search. CoRR arxiv:1604.01850 (2016)"},{"key":"17_CR34","doi-asserted-by":"crossref","unstructured":"Xie, Z., et al.: Simmim: a simple framework for masked image modeling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9653\u20139663 (2022)","DOI":"10.1109\/CVPR52688.2022.00943"},{"key":"17_CR35","doi-asserted-by":"crossref","unstructured":"Yan, S., Dong, N., Zhang, L., Tang, J.: Clip-driven fine-grained text-image person re-identification. IEEE Trans. Image Process. (2023)","DOI":"10.1109\/TIP.2023.3327924"},{"key":"17_CR36","doi-asserted-by":"crossref","unstructured":"Yan, S., Tang, H., Zhang, L., Tang, J.: Image-specific information suppression and implicit local alignment for text-based person search. IEEE Trans. Neural Netw. Learn. Syst. (2023)","DOI":"10.1109\/TNNLS.2023.3310118"},{"key":"17_CR37","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Lu, H.: Deep cross-modal projection learning for image-text matching. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 686\u2013701 (2018)","DOI":"10.1007\/978-3-030-01246-5_42"},{"key":"17_CR38","doi-asserted-by":"crossref","unstructured":"Zheng, K., Liu, W., Liu, J., Zha, Z.J., Mei, T.: Hierarchical gumbel attention network for text-based person search. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 3441\u20133449 (2020)","DOI":"10.1145\/3394171.3413864"},{"key":"17_CR39","unstructured":"Zheng, L., Shen, L., Tian, L., Wang, S., Bu, J., Tian, Q.: Person re-identification meets image search. CoRR arxiv:1502.02171 (2015)"},{"key":"17_CR40","doi-asserted-by":"publisher","unstructured":"Zheng, Y., et al.: CPCL: cross-modal prototypical contrastive learning for weakly supervised text-based person re-identification. CoRR arxiv:2401.10011 (2024). https:\/\/doi.org\/10.48550\/ARXIV.2401.10011","DOI":"10.48550\/ARXIV.2401.10011"},{"key":"17_CR41","doi-asserted-by":"crossref","unstructured":"Zhu, A., et al.: DSSL: deep surroundings-person separation learning for text-based person retrieval. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 209\u2013217 (2021)","DOI":"10.1145\/3474085.3475369"},{"key":"17_CR42","doi-asserted-by":"publisher","unstructured":"Zuo, J., Yu, C., Sang, N., Gao, C.: PLIP: language-image pre-training for person representation learning. CoRR arxiv:2305.08386 (2023). https:\/\/doi.org\/10.48550\/ARXIV.2305.08386","DOI":"10.48550\/ARXIV.2305.08386"}],"container-title":["Lecture Notes in Computer Science","Algorithms and Architectures for Parallel Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-1528-5_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,26]],"date-time":"2025-06-26T11:29:30Z","timestamp":1750937370000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-1528-5_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819615278","9789819615285"],"references-count":42,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-1528-5_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"15 February 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICA3PP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Algorithms and Architectures for Parallel Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Macau","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30 October 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 November 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ica3pp2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/ica3pp2024.scimeeting.cn\/en\/web\/index\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}