{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T01:08:46Z","timestamp":1782868126553,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611916","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"7737-7746","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":30,"title":["A Baseline Investigation: Transformer-based Cross-view Baseline for Text-based Person Search"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8421-7167","authenticated-orcid":false,"given":"Xianghao","family":"Zang","sequence":"first","affiliation":[{"name":"China Telecom Corporation Ltd. Data&amp;AI Technology Company, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7429-5495","authenticated-orcid":false,"given":"Wei","family":"Gao","sequence":"additional","affiliation":[{"name":"Shenzhen Graduate School, Peking University, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4079-3968","authenticated-orcid":false,"given":"Ge","family":"Li","sequence":"additional","affiliation":[{"name":"Shenzhen Graduate School, Peking University, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4379-2971","authenticated-orcid":false,"given":"Han","family":"Fang","sequence":"additional","affiliation":[{"name":"China Telecom Corporation Ltd. Data&amp;AI Technology Company, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8114-103X","authenticated-orcid":false,"given":"Chao","family":"Ban","sequence":"additional","affiliation":[{"name":"China Telecom Corporation Ltd. Data&amp;AI Technology Company, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1835-9271","authenticated-orcid":false,"given":"Zhongjiang","family":"He","sequence":"additional","affiliation":[{"name":"China Telecom Corporation Ltd. Data&amp;AI Technology Company, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7917-1628","authenticated-orcid":false,"given":"Hao","family":"Sun","sequence":"additional","affiliation":[{"name":"China Telecom Corporation Ltd. Data&amp;AI Technology Company, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01270-0_4"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3068825"},{"key":"e_1_3_2_1_3_1","volume-title":"TIPCB: A Simple but Effective Part-based Convolutional Baseline for Text-based Person Search. arXiv preprint arXiv:2105.11628","author":"Chen Yuhao","year":"2021","unstructured":"Yuhao Chen, Guoqing Zhang, Yujiang Lu, Zhenxing Wang, Yuhui Zheng, and Ruili Wang. 2021b. TIPCB: A Simple but Effective Part-based Convolutional Baseline for Text-based Person Search. arXiv preprint arXiv:2105.11628 (2021)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_1_5_1","volume-title":"Semantically Self-Aligned Network for Text-to-Image Part-aware Person Re-identification. arXiv preprint arXiv:2107.12666","author":"Ding Zefeng","year":"2021","unstructured":"Zefeng Ding, Changxing Ding, Zhiyin Shao, and Dacheng Tao. 2021. Semantically Self-Aligned Network for Text-to-Image Part-aware Person Re-identification. arXiv preprint arXiv:2107.12666 (2021)."},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the International Conference on Learning Representations. 1--22","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, and Sylvain Gelly. 2020. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In Proceedings of the International Conference on Learning Representations. 1--22."},{"key":"e_1_3_2_1_7_1","volume-title":"An Empirical Study of Training End-to-End Vision-and-Language Transformers. arXiv preprint arXiv:2111.02387","author":"Dou Zi-Yi","year":"2021","unstructured":"Zi-Yi Dou, Yichong Xu, Zhe Gan, Jianfeng Wang, Shuohang Wang, Lijuan Wang, Chenguang Zhu, Zicheng Liu, and Michael Zeng. 2021. An Empirical Study of Training End-to-End Vision-and-Language Transformers. arXiv preprint arXiv:2111.02387 (2021)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i4.20370"},{"key":"e_1_3_2_1_9_1","volume-title":"A Convolutional Baseline for Person Re-Identification Using Vision and Language Descriptions. arXiv preprint arXiv:2003.00808","author":"Farooq Ammarah","year":"2020","unstructured":"Ammarah Farooq, Muhammad Awais, Fei Yan, Josef Kittler, Ali Akbari, and Syed Safwan Khalid. 2020. A Convolutional Baseline for Person Re-Identification Using Vision and Language Descriptions. arXiv preprint arXiv:2003.00808 (2020)."},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the IEEE International Workshop on Performance Evaluation for Tracking and Surveillance","volume":"3","author":"Gray Douglas","year":"2007","unstructured":"Douglas Gray, Shane Brennan, and Hai Tao. 2007. Evaluating appearance models for recognition, reacquisition, and tracking. In Proceedings of the IEEE International Workshop on Performance Evaluation for Tracking and Surveillance, Vol. 3. Citeseer, 1--7."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.5244\/C.35.10"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_13_1","volume-title":"Pixel-bert: Aligning image pixels with text by deep multi-modal transformers. arXiv preprint arXiv:2004.00849","author":"Huang Zhicheng","year":"2020","unstructured":"Zhicheng Huang, Zhaoyang Zeng, Bei Liu, Dongmei Fu, and Jianlong Fu. 2020. Pixel-bert: Aligning image pixels with text by deep multi-modal transformers. arXiv preprint arXiv:2004.00849 (2020)."},{"key":"e_1_3_2_1_14_1","volume-title":"Lin Yuanbo Wu, and Ye Zhao","author":"Ji Zhong","year":"2022","unstructured":"Zhong Ji, Junhua Hu, Deyin Liu, Lin Yuanbo Wu, and Ye Zhao. 2022. Asymmetric Cross-Scale Alignment for Text-Based Person Search. IEEE Transactions on Multimedia (2022)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6777"},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the International Conference on Machine Learning. 5583--5594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: Vision-and-language transformer without convolution or region supervision. In Proceedings of the International Conference on Machine Learning. 5583--5594."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the Advances in Neural Information Processing Systems. 1--12","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. In Proceedings of the Advances in Neural Information Processing Systems. 1--12."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746846"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.551"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350991"},{"key":"e_1_3_2_1_24_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019a. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of the International Conference on Learning Representations. 1--16","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. In Proceedings of the International Conference on Learning Representations. 1--16."},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of the Advances in Neural Information Processing Systems. 1--11","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In Proceedings of the Advances in Neural Information Processing Systems. 1--11."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2021.104168"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547753"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.2984883"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413895"},{"key":"e_1_3_2_1_31_1","first-page":"1143","article-title":"Im2text: Describing images using 1 million captioned photographs","volume":"24","author":"Ordonez Vicente","year":"2011","unstructured":"Vicente Ordonez, Girish Kulkarni, and Tamara Berg. 2011. Im2text: Describing images using 1 million captioned photographs. In Proceedings of the Advances in Neural Information Processing Systems, Vol. 24. 1143--1151.","journal-title":"Proceedings of the Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_32_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. arXiv preprint arXiv:1912.01703","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, and Luca Antiga. 2019. Pytorch: An imperative style, high-performance deep learning library. arXiv preprint arXiv:1912.01703 (2019)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00591"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548028"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_2_1_36_1","volume-title":"Hao Tan, Mohit Bansal, Anna Rohrbach, Kai-Wei Chang, Zhewei Yao, and Kurt Keutzer.","author":"Shen Sheng","year":"2021","unstructured":"Sheng Shen, Liunian Harold Li, Hao Tan, Mohit Bansal, Anna Rohrbach, Kai-Wei Chang, Zhewei Yao, and Kurt Keutzer. 2021. How Much Can CLIP Benefit Vision-and-Language Tasks? arXiv preprint arXiv:2107.06383 (2021)."},{"key":"e_1_3_2_1_37_1","volume-title":"Exploiting the Textual Potential from Vision-Language Pre-training for Text-based Person Search. arXiv preprint arXiv:2303.04497","author":"Wang Guanshuo","year":"2023","unstructured":"Guanshuo Wang, Fufu Yu, Junjie Li, Qiong Jia, and Shouhong Ding. 2023. Exploiting the Textual Potential from Vision-Language Pre-training for Text-based Person Search. arXiv preprint arXiv:2303.04497 (2023)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58610-2_24"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548057"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548166"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00016"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00165"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TII.2022.3151766"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2021.104330"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1049\/ipr2.12380"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.2972168"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_42"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01120"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413864"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.5555\/2919332.2919877"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3383184"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475369"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611916","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611916","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:12:20Z","timestamp":1755821540000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611916"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":52,"alternative-id":["10.1145\/3581783.3611916","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611916","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}