{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,8]],"date-time":"2025-12-08T07:17:40Z","timestamp":1765178260191,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":33,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819609598"},{"type":"electronic","value":"9789819609604"}],"license":[{"start":{"date-parts":[[2024,12,8]],"date-time":"2024-12-08T00:00:00Z","timestamp":1733616000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,8]],"date-time":"2024-12-08T00:00:00Z","timestamp":1733616000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-0960-4_12","type":"book-chapter","created":{"date-parts":[[2024,12,7]],"date-time":"2024-12-07T07:34:54Z","timestamp":1733556894000},"page":"190-205","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["CrossPAR: Enhancing Pedestrian Attribute Recognition with\u00a0Vision-Language Fusion and\u00a0Human-Centric Pre-training"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-2290-1187","authenticated-orcid":false,"given":"Bach-Hoang","family":"Ngo","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Si-Tri","family":"Ngo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Phu-Duc","family":"Le","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Quang-Minh","family":"Phan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3046-3041","authenticated-orcid":false,"given":"Minh-Triet","family":"Tran","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7363-2610","authenticated-orcid":false,"given":"Trung-Nghia","family":"Le","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,12,8]]},"reference":[{"key":"12_CR1","doi-asserted-by":"crossref","unstructured":"Bai, Y., Cao, M., Gao, D., Cao, Z., Chen, C., Fan, Z., Nie, L., Zhang, M.: Rasa: Relation and sensitivity aware representation learning for text-based person search. arXiv preprint arXiv:2305.13653 (2023)","DOI":"10.24963\/ijcai.2023\/62"},{"key":"12_CR2","doi-asserted-by":"crossref","unstructured":"Bui, D.C., Le, T.V., Ngo, B.H.: C2t-net: Channel-aware cross-fused transformer-style networks for pedestrian attribute recognition. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV) Workshops. pp. 351\u2013358 (January 2024)","DOI":"10.1109\/WACVW60836.2024.00043"},{"key":"12_CR3","doi-asserted-by":"crossref","unstructured":"Chen, W., Xu, X., Jia, J., Luo, H., Wang, Y., Wang, F., Jin, R., Sun, X.: Beyond appearance: A semantic controllable self-supervised learning framework for human-centric visual tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 15050\u201315061 (June 2023)","DOI":"10.1109\/CVPR52729.2023.01445"},{"key":"12_CR4","doi-asserted-by":"crossref","unstructured":"Ci, Y., Wang, Y., Chen, M., Tang, S., Bai, L., Zhu, F., Zhao, R., Yu, F., Qi, D., Ouyang, W.: Unihcp: A unified model for human-centric perceptions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 17840\u201317852 (June 2023)","DOI":"10.1109\/CVPR52729.2023.01711"},{"key":"12_CR5","doi-asserted-by":"publisher","unstructured":"DENG, Y., Luo, P., Loy, C.C., Tang, X.: Pedestrian attribute recognition at far distance. In: Proceedings of the 22nd ACM International Conference on Multimedia. p. 789\u2013792. MM \u201914, Association for Computing Machinery, New York, NY, USA (2014). https:\/\/doi.org\/10.1145\/2647868.2654966, https:\/\/doi.org\/10.1145\/2647868.2654966","DOI":"10.1145\/2647868.2654966"},{"key":"12_CR6","unstructured":"Ding, Z., Ding, C., Shao, Z., Tao, D.: Semantically self-aligned network for text-to-image part-aware person re-identification. arXiv preprint arXiv:2107.12666 (2021)"},{"key":"12_CR7","doi-asserted-by":"crossref","unstructured":"Djenouri, Y., Belbachir, A.N.: A hybrid visual transformer for efficient deep human activity recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) Workshops. pp. 721\u2013730 (October 2023)","DOI":"10.1109\/ICCVW60793.2023.00080"},{"key":"12_CR8","unstructured":"Dong, X., Bao, J., Zhang, T., Chen, D., Shuyang, G., Zhang, W., Yuan, L., Chen, D., Wen, F., Yu, N.: Clip itself is a strong fine-tuner: Achieving 85.7 88.0 arXiv:2212.06138 (2022)"},{"key":"12_CR9","doi-asserted-by":"crossref","unstructured":"Guo, H., Zheng, K., Fan, X., Yu, H., Wang, S.: Visual attention consistency under image transforms for multi-label image classification. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (June 2019)","DOI":"10.1109\/CVPR.2019.00082"},{"key":"12_CR10","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.T., Parekh, Z., Pham, H., Le, Q.V., Sung, Y.H., Li, Z., Duerig, T.: Scaling up visual and vision-language representation learning with noisy text supervision. ArXiv abs\/2102.05918 (2021), https:\/\/api.semanticscholar.org\/CorpusID:231879586"},{"key":"12_CR11","doi-asserted-by":"crossref","unstructured":"Jia, J., Chen, X., Huang, K.: Spatial and semantic consistency regularizations for pedestrian attribute recognition. In: Proceedings of the IEEE\/CVF international conference on computer vision. pp. 962\u2013971 (2021)","DOI":"10.1109\/ICCV48922.2021.00100"},{"key":"12_CR12","unstructured":"Jia, J., Huang, H., Chen, X., Huang, K.: Rethinking of pedestrian attribute recognition: A reliable evaluation under zero-shot pedestrian identity setting. arXiv preprint arXiv:2107.03576 (2021)"},{"key":"12_CR13","doi-asserted-by":"crossref","unstructured":"Jiang, D., Ye, M.: Cross-modal implicit relation reasoning and aligning for text-to-image person retrieval. In: IEEE International Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00273"},{"key":"12_CR14","doi-asserted-by":"crossref","unstructured":"Jiang, D., Ye, M.: Cross-modal implicit relation reasoning and aligning for text-to-image person retrieval. In: IEEE International Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00273"},{"key":"12_CR15","unstructured":"Li, D., Zhang, Z., Chen, X., Ling, H., Huang, K.: A richly annotated dataset for pedestrian attribute recognition. ArXiv abs\/1603.07054 (2016)"},{"key":"12_CR16","doi-asserted-by":"crossref","unstructured":"Li, D., Chen, X., Huang, K.: Multi-attribute learning for pedestrian attribute recognition in surveillance scenarios. 2015 3rd IAPR Asian Conference on Pattern Recognition (ACPR) pp. 111\u2013115 (2015), https:\/\/api.semanticscholar.org\/CorpusID:9475404","DOI":"10.1109\/ACPR.2015.7486476"},{"key":"12_CR17","doi-asserted-by":"crossref","unstructured":"Li, S., Xiao, T., Li, H., Zhou, B., Yue, D., Wang, X.: Person search with natural language description. arXiv preprint arXiv:1702.05729 (2017)","DOI":"10.1109\/CVPR.2017.551"},{"key":"12_CR18","unstructured":"Li, Y., Liang, F., Zhao, L., Cui, Y., Ouyang, W., Shao, J., Yu, F., Yan, J.: Supervision exists everywhere: A data efficient contrastive language-image pre-training paradigm. In: International Conference on Learning Representations (2022), https:\/\/openreview.net\/forum?id=zq1iJkNk3uN"},{"key":"12_CR19","doi-asserted-by":"crossref","unstructured":"Liu, X., Zhao, H., Tian, M., Sheng, L., Shao, J., Yi, S., Yan, J., Wang, X.: Hydraplus-net: Attentive deep features for pedestrian analysis. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (Oct 2017)","DOI":"10.1109\/ICCV.2017.46"},{"key":"12_CR20","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., Guo, B.: Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"12_CR21","doi-asserted-by":"publisher","unstructured":"Liu, Z., Zhang, Z., Li, D., Zhang, P., Shan, C.: Dual-branch self-attention network for pedestrian attribute recognition. Pattern Recogn. Lett. p. 112\u2013120 (nov 2022). https:\/\/doi.org\/10.1016\/j.patrec.2022.10.003","DOI":"10.1016\/j.patrec.2022.10.003"},{"key":"12_CR22","doi-asserted-by":"publisher","unstructured":"Mordan, T., Cord, M., P\u00e9rez, P., Alahi, A.: Detecting 32 pedestrian attributes for autonomous vehicles. IEEE Transactions on Intelligent Transportation Systems (T-ITS) (2021). https:\/\/doi.org\/10.1109\/TITS.2021.3107587","DOI":"10.1109\/TITS.2021.3107587"},{"key":"12_CR23","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., Krueger, G., Sutskever, I.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"12_CR24","doi-asserted-by":"crossref","unstructured":"Specker, A., Cormier, M., Beyerer, J.: Upar: Unified pedestrian attribute recognition and person retrieval. 2023 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV) pp. 981\u2013990 (2022), https:\/\/api.semanticscholar.org\/CorpusID:252090333","DOI":"10.1109\/WACV56688.2023.00104"},{"key":"12_CR25","unstructured":"Sun, Q., Fang, Y., Wu, L., Wang, X., Cao, Y.: Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389 (2023)"},{"key":"12_CR26","first-page":"12055","volume":"34","author":"Z Tan","year":"2020","unstructured":"Tan, Z., Yang, Y., Wan, J., Guo, G., Li, S.Z.: Relation-aware pedestrian attribute recognition with graph convolutional networks 34, 12055\u201312062 (2020)","journal-title":"Relation-aware pedestrian attribute recognition with graph convolutional networks"},{"key":"12_CR27","doi-asserted-by":"publisher","unstructured":"Tan, Z., Yang, Y., Wan, J., Guo, G., Li, S.Z.: Relation-aware pedestrian attribute recognition with graph convolutional networks. Proceedings of the AAAI Conference on Artificial Intelligence 34(07), 12055\u201312062 (Apr2020). https:\/\/doi.org\/10.1609\/aaai.v34i07.6883, https:\/\/ojs.aaai.org\/index.php\/AAAI\/article\/view\/6883","DOI":"10.1609\/aaai.v34i07.6883"},{"key":"12_CR28","doi-asserted-by":"crossref","unstructured":"Tang, C., Sheng, L., Zhang, Z., Hu, X.: Improving pedestrian attribute recognition with weakly-supervised multi-scale attribute-specific localization. In: Proceedings of the IEEE International Conference on Computer Vision. pp. 4997\u20135006 (2019)","DOI":"10.1109\/ICCV.2019.00510"},{"key":"12_CR29","doi-asserted-by":"crossref","unstructured":"Tang, S., Chen, C., Xie, Q., Chen, M., Wang, Y., Ci, Y., Bai, L., Zhu, F., Yang, H., Yi, L., Zhao, R., Ouyang, W.: Humanbench: Towards general human-centric perception with projector assisted pretraining. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 21970\u201321982 (June 2023)","DOI":"10.1109\/CVPR52729.2023.02104"},{"key":"12_CR30","doi-asserted-by":"crossref","unstructured":"Wang, W., Bao, H., Dong, L., Bjorck, J., Peng, Z., Liu, Q., Aggarwal, K., Mohammed, O.K., Singhal, S., Som, S., Wei, F.: Image as a foreign language: Beit pretraining for vision and vision-language tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 19175\u201319186 (June 2023)","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"12_CR31","doi-asserted-by":"crossref","unstructured":"Wang, W., Bao, H., Dong, L., Bjorck, J., Peng, Z., Liu, Q., Aggarwal, K., Mohammed, O.K., Singhal, S., Som, S., Wei, F.: Image as a foreign language: BEiT pretraining for vision and vision-language tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"12_CR32","doi-asserted-by":"crossref","unstructured":"Yang, S., Zhou, Y., Wang, Y., Wu, Y., Zhu, L., Zheng, Z.: Towards unified text-based person retrieval: A large-scale multi-attribute and language search benchmark. In: Proceedings of the 2023 ACM on Multimedia Conference (2023)","DOI":"10.1145\/3581783.3611709"},{"key":"12_CR33","unstructured":"Zeng, Y., Zhang, X., Li, H., Wang, J., Zhang, J., Zhou, W.: X2-vlm: All-in-one pre-trained model for vision-language tasks. arXiv preprint arXiv:2211.12402 (2022)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ACCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-0960-4_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,7]],"date-time":"2024-12-07T08:32:41Z","timestamp":1733560361000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-0960-4_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,8]]},"ISBN":["9789819609598","9789819609604"],"references-count":33,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-0960-4_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,8]]},"assertion":[{"value":"8 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ACCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Asian Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hanoi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Vietnam","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"12 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"accv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}