{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,25]],"date-time":"2025-03-25T22:58:01Z","timestamp":1742943481634,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":37,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819786190"},{"type":"electronic","value":"9789819786206"}],"license":[{"start":{"date-parts":[[2024,10,20]],"date-time":"2024-10-20T00:00:00Z","timestamp":1729382400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,20]],"date-time":"2024-10-20T00:00:00Z","timestamp":1729382400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-97-8620-6_17","type":"book-chapter","created":{"date-parts":[[2024,10,19]],"date-time":"2024-10-19T21:02:10Z","timestamp":1729371730000},"page":"246-259","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Cross-Modal Dual Matching and\u00a0Comparison for\u00a0Text-to-Image Person Re-identification"],"prefix":"10.1007","author":[{"given":"Lin","family":"Cao","sequence":"first","affiliation":[]},{"given":"Wenwen","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Yanan","family":"Guo","sequence":"additional","affiliation":[]},{"given":"Shoujing","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Boqian","family":"Lv","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,20]]},"reference":[{"key":"17_CR1","doi-asserted-by":"publisher","first-page":"6025","DOI":"10.1109\/TMM.2023.3344354","volume":"26","author":"G Han","year":"2023","unstructured":"Han, G., Lin, M., Li, Z., Zhao, H., Kwong, S.: Text-to-image person re-identification based on multimodal graph convolutional network. IEEE Trans. Multimedia 26, 6025\u20136036 (2023)","journal-title":"IEEE Trans. Multimedia"},{"doi-asserted-by":"crossref","unstructured":"Shao, Z., Zhang, X., Ding, C., Wang, J., Wang, J.: Unified pre-training with pseudo texts for text-to-image person re-identification. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11174\u201311184 (2023)","key":"17_CR2","DOI":"10.1109\/ICCV51070.2023.01026"},{"doi-asserted-by":"crossref","unstructured":"Yan, S., Tang, H., Zhang, L., Tang, J.: Image-specific information suppression and implicit local alignment for text-based person search. IEEE Trans. Neural. Netw. Learn. Syst. (2023)","key":"17_CR3","DOI":"10.1109\/TNNLS.2023.3310118"},{"doi-asserted-by":"crossref","unstructured":"Zhang, Y., Lu, H.: Deep cross-modal projection learning for image-text matching. In: Proceedings of the European Conference on Computer Vision, pp. 686\u2013701 (2018)","key":"17_CR4","DOI":"10.1007\/978-3-030-01246-5_42"},{"doi-asserted-by":"crossref","unstructured":"Sarafianos, N., Xu, X., Kakadiaris, I.A.: Adversarial representation learning for text-to-image matching. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5814\u20135824 (2019)","key":"17_CR5","DOI":"10.1109\/ICCV.2019.00591"},{"unstructured":"Han, X., He, S., Zhang, L.: (2021). arXiv:2110.10807","key":"17_CR6"},{"doi-asserted-by":"crossref","unstructured":"Chen, T., Xu, C., Luo, J.: Improving text-based person search by spatial matching and adaptive threshold. In: Proceedings of the Winter Conference on Applications of Computer Vision, pp. 1879\u20131887 (2018)","key":"17_CR7","DOI":"10.1109\/WACV.2018.00208"},{"key":"17_CR8","doi-asserted-by":"publisher","first-page":"5542","DOI":"10.1109\/TIP.2020.2984883","volume":"29","author":"K Niu","year":"2020","unstructured":"Niu, K., Huang, Y., Ouyang, W., Wang, L.: Improving description-based person re-identification by multi-granularity image-text alignments. IEEE Trans. Image Process. 29, 5542\u20135556 (2020)","journal-title":"IEEE Trans. Image Process."},{"doi-asserted-by":"crossref","unstructured":"Aggarwal, S., Radhakrishnan, V.B., Chakraborty, A.: Text-based person search via attribute-aided matching. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 2617\u20132625 (2020)","key":"17_CR9","DOI":"10.1109\/WACV45572.2020.9093640"},{"doi-asserted-by":"crossref","unstructured":"Wang, Z., Fang, Z., Wang, J., Yang, Y.: Vitaa: visual-textual attributes alignment in person search by natural language. In: Proceedings of the European Conference on Computer Vision, pp. 402\u2013420 (2020)","key":"17_CR10","DOI":"10.1007\/978-3-030-58610-2_24"},{"doi-asserted-by":"crossref","unstructured":"Jing, Y., Si, C., Wang, J., Wang, W., Wang, L., Tan, T.: Pose-guided multi-granularity attention network for text-based person search. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 11189\u201311196 (2020)","key":"17_CR11","DOI":"10.1609\/aaai.v34i07.6777"},{"key":"17_CR12","doi-asserted-by":"publisher","first-page":"4057","DOI":"10.1109\/TIP.2021.3068825","volume":"30","author":"Y Chen","year":"2021","unstructured":"Chen, Y., Huang, R., Chang, H., Tan, C., Xue, T., Ma, B.: Cross-modal knowledge adaptation for language-based person search. IEEE Trans. Image Process. 30, 4057\u20134069 (2021)","journal-title":"IEEE Trans. Image Process."},{"doi-asserted-by":"crossref","unstructured":"Wu, Y., Yan, Z., Han, X., Li, G., Zou, C., Cui, S.: Lapscore: language-guided person search via color reasoning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1624\u20131633 (2021)","key":"17_CR13","DOI":"10.1109\/ICCV48922.2021.00165"},{"doi-asserted-by":"crossref","unstructured":"Jiang, D., Ye, M.: Cross-modal implicit relation reasoning and aligning for text-to-image person retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2787\u20132797 (2023)","key":"17_CR14","DOI":"10.1109\/CVPR52729.2023.00273"},{"doi-asserted-by":"crossref","unstructured":"Lin, D., Peng, Y., Meng, J., Zheng, W.S.: Cross-modal adaptive dual association for text-to-image person retrieval. IEEE Trans. Multimedia (2024)","key":"17_CR15","DOI":"10.1109\/TMM.2024.3355644"},{"issue":"1","key":"17_CR16","doi-asserted-by":"publisher","first-page":"38","DOI":"10.1007\/s11633-022-1369-5","volume":"20","author":"FL Chen","year":"2023","unstructured":"Chen, F.L., Zhang, D.Z., Han, M.L., Chen, X.Y., Shi, J., Xu, S., Xu, B.: Vlp: a survey on vision-language pre-training. Mach. Intell. Res. 20(1), 38\u201356 (2023)","journal-title":"Mach. Intell. Res."},{"unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al.: Learning transferable visual models from natural language supervision. In: Proceedings of the International Conference on Machine Learning, pp. 8748\u20138763 (2021)","key":"17_CR17"},{"doi-asserted-by":"crossref","unstructured":"Li, W., Gao, C., Niu, G., Xiao, X., Liu, H., Liu, J., Wu, H., Wang, H.: Unimo: towards unified-modal understanding and generation via cross-modal contrastive learning (2020). arXiv:2012.15409","key":"17_CR18","DOI":"10.18653\/v1\/2021.acl-long.202"},{"unstructured":"Yuan, L., Chen, D., Chen, Y.L., Codella, N., Dai, X., Gao, J., Hu, H., Huang, X., Li, B., Li, C., et\u00a0al.: Florence: a new foundation model for computer vision (2021). arXiv:2111.11432","key":"17_CR19"},{"unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.T., Parekh, Z., Pham, H., Le, Q., Sung, Y.H., Li, Z., Duerig, T.: Scaling up visual and vision-language representation learning with noisy text supervision. In: Proceedings of the International Conference on Machine Learning, pp. 4904\u20134916 (2021)","key":"17_CR20"},{"key":"17_CR21","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., Hoi, S.C.H.: Align before fuse: vision and language representation learning with momentum distillation. Adv. Neural. Inf. Process. Syst. 34, 9694\u20139705 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: Proceedings of the International Conference on Machine Learning, pp. 19730\u201319742 (2023)","key":"17_CR22"},{"unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: Coca: Contrastive captioners are image-text foundation models (2022). arXiv:2205.01917","key":"17_CR23"},{"doi-asserted-by":"crossref","unstructured":"Wang, W., Bao, H., Dong, L., Bjorck, J., Peng, Z., Liu, Q., Aggarwal, K., Mohammed, O.K., Singhal, S., Som, S., et\u00a0al.: Image as a foreign language: beit pretraining for vision and vision-language tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19175\u201319186 (2023)","key":"17_CR24","DOI":"10.1109\/CVPR52729.2023.01838"},{"doi-asserted-by":"crossref","unstructured":"Li, S., Xiao, T., Li, H., Zhou, B., Yue, D., Wang, X.: Person search with natural language description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1970\u20131979 (2017)","key":"17_CR25","DOI":"10.1109\/CVPR.2017.551"},{"doi-asserted-by":"crossref","unstructured":"Zhu, A., Wang, Z., Li, Y., Wan, X., Jin, J., Wang, T., Hu, F., Hua, G.: Dssl: deep surroundings-person separation learning for text-based person retrieval. In: Proceedings of the ACM International Conference on Multimedia, pp. 209\u2013217 (2021)","key":"17_CR26","DOI":"10.1145\/3474085.3475369"},{"doi-asserted-by":"crossref","unstructured":"Lee, K.H., Chen, X., Hua, G., Hu, H., He, X.: Stacked cross attention for image-text matching. In: Proceedings of the European Conference on Computer Vision, pp. 201\u2013216 (2018)","key":"17_CR27","DOI":"10.1007\/978-3-030-01225-0_13"},{"doi-asserted-by":"crossref","unstructured":"Shu, X., Wen, W., Wu, H., Chen, K., Song, Y., Qiao, R., Ren, B., Wang, X.: See finer, see more: implicit modality alignment for text-based person retrieval. In: Proceedings of the European Conference on Computer Vision, pp. 624\u2013641 (2022)","key":"17_CR28","DOI":"10.1007\/978-3-031-25072-9_42"},{"doi-asserted-by":"crossref","unstructured":"Wang, Z., Zhu, A., Xue, J., Wan, X., Liu, C., Wang, T., Li, Y.: Look before you leap: improving text-based person retrieval by learning a consistent cross-modal common manifold. In: Proceedings of the ACM International Conference on Multimedia, pp. 1984\u20131992 (2022)","key":"17_CR29","DOI":"10.1145\/3503161.3548166"},{"doi-asserted-by":"crossref","unstructured":"Li, S., Cao, M., Zhang, M.: Learning semantic-aligned feature representation for text-based person search. In: Proceedings of the International Conference on Acoustics, Speech and Signal Processing, pp. 2724\u20132728 (2022)","key":"17_CR30","DOI":"10.1109\/ICASSP43922.2022.9746846"},{"key":"17_CR31","doi-asserted-by":"publisher","first-page":"171","DOI":"10.1016\/j.neucom.2022.04.081","volume":"494","author":"Y Chen","year":"2022","unstructured":"Chen, Y., Zhang, G., Lu, Y., Wang, Z., Zheng, Y.: Tipcb: simple but effective part-based convolutional baseline for text-based person searcha. Neurocomputing 494, 171\u2013181 (2022)","journal-title":"Neurocomputing"},{"doi-asserted-by":"crossref","unstructured":"Wang, Z., Zhu, A., Xue, J., Wan, X., Liu, C., Wang, T., Li, Y.: Caibc: capturing all-round information beyond color for text-based person retrieval. In: Proceedings of the ACM International Conference on Multimedia, pp. 5314\u20135322 (2022)","key":"17_CR32","DOI":"10.1145\/3503161.3548057"},{"doi-asserted-by":"crossref","unstructured":"Farooq, A., Awais, M., Kittler, J., Khalid, S.S.: Axm-net: Implicit cross-modal feature alignment for person re-identification. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a036, pp. 4477\u20134485 (2022)","key":"17_CR33","DOI":"10.1609\/aaai.v36i4.20370"},{"doi-asserted-by":"crossref","unstructured":"Farooq, A., Awais, M., Kittler, J., Khalid, S.S.: Axm-net: implicit cross-modal feature alignment for person re-identification. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a036, pp. 4477\u20134485 (2022)","key":"17_CR34","DOI":"10.1609\/aaai.v36i4.20370"},{"doi-asserted-by":"crossref","unstructured":"Shen, F., Shu, X., Du, X., Tang, J.: Pedestrian-specific bipartite-aware similarity learning for text-based person retrieval. In: Proceedings of the ACM International Conference on Multimedia, pp. 8922\u20138931 (2023)","key":"17_CR35","DOI":"10.1145\/3581783.3612009"},{"doi-asserted-by":"crossref","unstructured":"Ma, Y., Sun, X., Ji, J., Jiang, G., Zhuang, W., Ji, R.: Beat: Bi-directional one-to-many embedding alignment for text-based person retrieval. In: Proceedings of the ACM International Conference on Multimedia, pp. 4157\u20134168 (2023)","key":"17_CR36","DOI":"10.1145\/3581783.3611768"},{"doi-asserted-by":"crossref","unstructured":"Wu, H., Chen, W., Liu, Z., Chen, T., Chen, Z., Lin, L.: Contrastive transformer learning with proximity data generation for text-based person search. IEEE Trans. Circuits Syst. Video Technol. (2023)","key":"17_CR37","DOI":"10.1109\/TCSVT.2023.3329220"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-8620-6_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,14]],"date-time":"2025-01-14T20:19:03Z","timestamp":1736885943000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-8620-6_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,20]]},"ISBN":["9789819786190","9789819786206"],"references-count":37,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-8620-6_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,20]]},"assertion":[{"value":"20 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Urumqi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2024.prcv.cn\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}