{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:10:14Z","timestamp":1765343414260,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["2376004"],"award-info":[{"award-number":["2376004"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754721","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:55Z","timestamp":1761377215000},"page":"2723-2731","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Learning Hierarchical Cross-modal Association with Intra-modal Context for Text-Image Person Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4949-9995","authenticated-orcid":false,"given":"Yifei","family":"Deng","sequence":"first","affiliation":[{"name":"State Key Laboratory of Opto-Electronic Information Acquisition and Protection Technology, Anhui University, Hefei, China and School of Computer Science and Technology, Anhui University, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7233-2739","authenticated-orcid":false,"given":"Chenglong","family":"Li","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Opto-Electronic Information Acquisition and Protection Technology, Anhui University, Hefei, China and School of Artificial Intelligence, Anhui University, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4181-8485","authenticated-orcid":false,"given":"Futian","family":"Wang","sequence":"additional","affiliation":[{"name":"Anhui Provincial Key Laboratory of Multimodal Cognitive Computation, Anhui University, Hefei, China and School of Computer Science and Technology, Anhui University, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8375-3590","authenticated-orcid":false,"given":"Jin","family":"Tang","sequence":"additional","affiliation":[{"name":"Anhui Provincial Key Laboratory of Multimodal Cognitive Computation, Anhui University, Hefei, China and School of Computer Science and Technology, Anhui University, Hefei, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.04.081"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/s44267-025-00078-x"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3452982"},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171--4186."},{"key":"e_1_3_2_1_5_1","volume-title":"Semantically self-aligned network for text-to-image part-aware person re-identification. arXiv preprint arXiv:2107.12666","author":"Ding Zefeng","year":"2021","unstructured":"Zefeng Ding, Changxing Ding, Zhiyin Shao, and Dacheng Tao. 2021. Semantically self-aligned network for text-to-image part-aware person re-identification. arXiv preprint arXiv:2107.12666 (2021)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3344354"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01474"},{"key":"e_1_3_2_1_8_1","first-page":"57","article-title":"Bootstrapping vision language learning with decoupled language pre-training","volume":"36","author":"Jian Yiren","year":"2023","unstructured":"Yiren Jian, Chongyang Gao, and Soroush Vosoughi. 2023. Bootstrapping vision language learning with decoupled language pre-training. Advances in Neural Information Processing Systems 36 (2023), 57--72.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00273"},{"key":"e_1_3_2_1_10_1","volume-title":"International conference on machine learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_11_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems 34","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems 34 (2021), 9694--9705."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01794"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.551"},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings, Part XXX 16","author":"Li Xiujun","year":"2020","unstructured":"Xiujun Li, Xi Yin, Chunyuan Li, Pengchuan Zhang, Xiaowei Hu, Lei Zhang, Lijuan Wang, Houdong Hu, Li Dong, Furu Wei, et al. 2020. Oscar: Object-semantics aligned pre-training for vision-language tasks. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XXX 16. Springer, 121--137."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3355644"},{"key":"e_1_3_2_1_16_1","volume-title":"European Conference on Computer Vision. Springer, 38--55","author":"Liu Shilong","year":"2024","unstructured":"Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Qing Jiang, Chunyuan Li, Jianwei Yang, Hang Su, et al. 2024. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. In European Conference on Computer Vision. Springer, 38--55."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i12.29314"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611768"},{"key":"e_1_3_2_1_19_1","volume-title":"International conference on machine learning. PmLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748--8763."},{"key":"e_1_3_2_1_20_1","volume-title":"Sentence-bert: Sentence embeddings using siamese bert-networks. arXiv preprint arXiv:1908.10084","author":"Reimers Nils","year":"2019","unstructured":"Nils Reimers and Iryna Gurevych. 2019. Sentence-bert: Sentence embeddings using siamese bert-networks. arXiv preprint arXiv:1908.10084 (2019)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01026"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548028"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28298"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_42"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01621"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681553"},{"key":"e_1_3_2_1_28_1","volume-title":"Vitaa: Visual textual attributes alignment in person search by natural language. In Computer vision--ECCV 2020: 16th European conference, glasgow, UK, August 23--28","author":"Wang Zhe","year":"2020","unstructured":"Zhe Wang, Zhiyuan Fang, Jun Wang, and Yezhou Yang. 2020. Vitaa: Visual textual attributes alignment in person search by natural language. In Computer vision--ECCV 2020: 16th European conference, glasgow, UK, August 23--28, 2020, proceedings, part XII 16. Springer, 402--420."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611832"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3327924"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681165"},{"key":"e_1_3_2_1_32_1","volume-title":"Image-specific information suppression and implicit local alignment for text-based person search","author":"Yan Shuanglin","year":"2023","unstructured":"Shuanglin Yan, Hao Tang, Liyan Zhang, and Jinhui Tang. 2023. Image-specific information suppression and implicit local alignment for text-based person search. IEEE transactions on neural networks and learning systems (2023)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01522"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611709"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2024.111247"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3369699"},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","volume":"38","author":"Zhao Zhiwei","year":"2024","unstructured":"Zhiwei Zhao, Bin Liu, Yan Lu, Qi Chu, and Nenghai Yu. 2024. Unifying multimodal uncertainty modeling and semantic alignment for text-to-image person re-identification. In Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 38. 7534--7542."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475369"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3067997"},{"key":"e_1_3_2_1_40_1","first-page":"45666","article-title":"Plip: Language-image pre-training for person representation learning","volume":"37","author":"Zuo Jialong","year":"2024","unstructured":"Jialong Zuo, Jiahao Hong, Feng Zhang, Changqian Yu, Hanyu Zhou, Changxin Gao, Nong Sang, and Jingdong Wang. 2024. Plip: Language-image pre-training for person representation learning. Advances in Neural Information Processing Systems 37 (2024), 45666--45702.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02078"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754721","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:06:58Z","timestamp":1765343218000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754721"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":41,"alternative-id":["10.1145\/3746027.3754721","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754721","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}