{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,28]],"date-time":"2026-01-28T20:54:39Z","timestamp":1769633679688,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Science Foundation of China","award":["62002252, 62106165"],"award-info":[{"award-number":["62002252, 62106165"]}]},{"name":"Key Laboratory of Artificial Intelligence, Ministry of Education, P.R. China"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612285","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:12Z","timestamp":1698391632000},"page":"757-767","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":20,"title":["Text-based Person Search without Parallel Image-Text Data"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6845-5963","authenticated-orcid":false,"given":"Yang","family":"Bai","sequence":"first","affiliation":[{"name":"Soochow University, Suzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0367-5120","authenticated-orcid":false,"given":"Jingyao","family":"Wang","sequence":"additional","affiliation":[{"name":"Soochow University, Suzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6628-0370","authenticated-orcid":false,"given":"Min","family":"Cao","sequence":"additional","affiliation":[{"name":"Soochow University, Suzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8297-6549","authenticated-orcid":false,"given":"Chen","family":"Chen","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1077-9033","authenticated-orcid":false,"given":"Ziqiang","family":"Cao","sequence":"additional","affiliation":[{"name":"Soochow University, Suzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1476-0273","authenticated-orcid":false,"given":"Liqiang","family":"Nie","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3895-5510","authenticated-orcid":false,"given":"Min","family":"Zhang","sequence":"additional","affiliation":[{"name":"Soochow University, Suzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_2_1","volume-title":"RaSa: Relation and Sensitivity Aware Representation Learning for Text-based Person Search. arXiv preprint arXiv:2305.13653","author":"Bai Yang","year":"2023","unstructured":"Yang Bai, Min Cao, Daming Gao, Ziqiang Cao, Chen Chen, Zhenfeng Fan, Liqiang Nie, and Min Zhang. 2023. RaSa: Relation and Sensitivity Aware Representation Learning for Text-based Person Search. arXiv preprint arXiv:2305.13653 (2023)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3060948"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.2994524"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/759"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.742"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01270-0_4"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.04.081"},{"key":"e_1_3_2_1_9_1","volume-title":"Semantically self-aligned network for text-to-image part-aware person re-identification. arXiv preprint arXiv:2107.12666","author":"Ding Zefeng","year":"2021","unstructured":"Zefeng Ding, Changxing Ding, Zhiyin Shao, and Dacheng Tao. 2021. Semantically self-aligned network for text-to-image part-aware person re-identification. arXiv preprint arXiv:2107.12666 (2021)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2020.107573"},{"key":"e_1_3_2_1_11_1","volume-title":"A convolutional baseline for person re-identification using vision and language descriptions. arXiv preprint arXiv:2003.00808","author":"Farooq Ammarah","year":"2020","unstructured":"Ammarah Farooq, Muhammad Awais, Fei Yan, Josef Kittler, Ali Akbari, and Syed Safwan Khalid. 2020. A convolutional baseline for person re-identification using vision and language descriptions. arXiv preprint arXiv:2003.00808 (2020)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00425"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3205216"},{"key":"e_1_3_2_1_14_1","volume-title":"Contextual non-local alignment over full-scale representation for text-based person search. arXiv preprint arXiv:2101.03036","author":"Gao Chenyang","year":"2021","unstructured":"Chenyang Gao, Guanyu Cai, Xinyang Jiang, Feng Zheng, Jun Zhang, Yifei Gong, Pai Peng, Xiaowei Guo, and Xing Sun. 2021. Contextual non-local alignment over full-scale representation for text-based person search. arXiv preprint arXiv:2101.03036 (2021)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.01042"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.5555\/3491440.3491568"},{"key":"e_1_3_2_1_17_1","volume-title":"Text-based person search with limited data. arXiv preprint arXiv:2110.10807","author":"Han Xiao","year":"2021","unstructured":"Xiao Han, Sen He, Li Zhang, and Tao Xiang. 2021. Text-based person search with limited data. arXiv preprint arXiv:2110.10807 (2021)."},{"key":"e_1_3_2_1_18_1","volume-title":"Lin Yuanbo Wu, and Ye Zhao","author":"Ji Zhong","year":"2022","unstructured":"Zhong Ji, Junhua Hu, Deyin Liu, Lin Yuanbo Wu, and Ye Zhao. 2022. Asymmetric Cross-Scale Alignment for Text-Based Person Search. IEEE Transactions on Multimedia (2022)."},{"key":"e_1_3_2_1_19_1","volume-title":"Cross-Modal Implicit Relation Reasoning and Aligning for Text-to-Image Person Retrieval. In IEEE International Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Jiang Ding","year":"2023","unstructured":"Ding Jiang and Mang Ye. 2023. Cross-Modal Implicit Relation Reasoning and Aligning for Text-to-Image Person Retrieval. In IEEE International Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6777"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01069"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1110"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00751"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2898940"},{"key":"e_1_3_2_1_25_1","volume-title":"International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022b. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_26_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021a. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, Vol. 34 (2021), 9694--9705."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.420"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746846"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.209"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.551"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM.2019.00054"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.3115\/1118108.1118117"},{"key":"e_1_3_2_1_33_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_34_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog Vol. 1 8 (2019) 9."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.5555\/3455716.3455856"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.13"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548028"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-25072-9_42"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547876"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3272169"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3251025"},{"key":"e_1_3_2_1_42_1","volume-title":"Tel Aviv","author":"Suo Wei","year":"2022","unstructured":"Wei Suo, Mengyang Sun, Kai Niu, Yiqi Gao, Peng Wang, Yanning Zhang, and Qi Wu. 2022. A Simple and Robust Correlation Filtering Method for Text-Based Person Search. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXXV. Springer, 726--742."},{"key":"e_1_3_2_1_43_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_1_45_1","volume-title":"International Conference on Machine Learning. PMLR, 23318--23340","author":"Wang Peng","year":"2022","unstructured":"Peng Wang, An Yang, Rui Men, Junyang Lin, Shuai Bai, Zhikang Li, Jianxin Ma, Chang Zhou, Jingren Zhou, and Hongxia Yang. 2022b. Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In International Conference on Machine Learning. PMLR, 23318--23340."},{"key":"e_1_3_2_1_46_1","volume-title":"International Conference on Machine Learning. PMLR, 22680--22690","author":"Wang Teng","year":"2022","unstructured":"Teng Wang, Wenhao Jiang, Zhichao Lu, Feng Zheng, Ran Cheng, Chengguo Yin, and Ping Luo. 2022a. Vlmixer: Unpaired vision-language pre-training via cross-modal cutmix. In International Conference on Machine Learning. PMLR, 22680--22690."},{"key":"e_1_3_2_1_47_1","volume-title":"Vitaa: Visual-textual attributes alignment in person search by natural language. In Computer Vision-ECCV 2020: 16th European Conference","author":"Wang Zhe","year":"2020","unstructured":"Zhe Wang, Zhiyuan Fang, Jun Wang, and Yezhou Yang. 2020. Vitaa: Visual-textual attributes alignment in person search by natural language. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part XII 16. Springer, 402--420."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548057"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548166"},{"key":"e_1_3_2_1_50_1","volume-title":"Calibrating Cross-modal Feature for Text-Based Person Searching. arXiv preprint arXiv:2304.02278","author":"Wei Donglai","year":"2023","unstructured":"Donglai Wei, Sipeng Zhang, Tong Yang, and Jing Liu. 2023. Calibrating Cross-modal Feature for Text-Based Person Searching. arXiv preprint arXiv:2304.02278 (2023)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00165"},{"key":"e_1_3_2_1_52_1","volume-title":"CLIP-Driven Fine-grained Text-Image Person Re-identification. arXiv preprint arXiv:2210.10276","author":"Yan Shuanglin","year":"2022","unstructured":"Shuanglin Yan, Neng Dong, Liyan Zhang, and Jinhui Tang. 2022. CLIP-Driven Fine-grained Text-Image Person Re-identification. arXiv preprint arXiv:2210.10276 (2022)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806326"},{"key":"e_1_3_2_1_54_1","volume-title":"Deep learning for person re-identification: A survey and outlook","author":"Ye Mang","year":"2021","unstructured":"Mang Ye, Jianbing Shen, Gaojie Lin, Tao Xiang, Ling Shao, and Steven CH Hoi. 2021. Deep learning for person re-identification: A survey and outlook. IEEE transactions on pattern analysis and machine intelligence, Vol. 44, 6 (2021), 2872--2893."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_42"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01120"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413864"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3383184"},{"key":"e_1_3_2_1_59_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 16485--16494","author":"Zhou Mingyang","year":"2022","unstructured":"Mingyang Zhou, Licheng Yu, Amanpreet Singh, Mengjiao Wang, Zhou Yu, and Ning Zhang. 2022. Unsupervised vision-and-language pre-training via retrieval-based multi-granular alignment. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 16485--16494."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475369"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612285","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612285","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:09:00Z","timestamp":1755821340000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612285"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":60,"alternative-id":["10.1145\/3581783.3612285","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612285","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}