{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T10:36:30Z","timestamp":1763202990331,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":32,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100010023","name":"Natural Science Research of Jiangsu Higher Education Institutions of China","doi-asserted-by":"publisher","award":["21KJB520008"],"award-info":[{"award-number":["21KJB520008"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100010023","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["Grant No. 62101245"],"award-info":[{"award-number":["Grant No. 62101245"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681715","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"10105-10113","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["TVPR: Text-to-Video Person Retrieval and a New Benchmark"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-0108-1732","authenticated-orcid":false,"given":"Xu","family":"Zhang","sequence":"first","affiliation":[{"name":"Nanjing Tech University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0333-7878","authenticated-orcid":false,"given":"Fan","family":"Ni","sequence":"additional","affiliation":[{"name":"Nanjing Tech University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1919-3258","authenticated-orcid":false,"given":"Guan-Nan","family":"Dong","sequence":"additional","affiliation":[{"name":"Nanjing Tech University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6972-5534","authenticated-orcid":false,"given":"Aichun","family":"Zhu","sequence":"additional","affiliation":[{"name":"Nanjing Tech University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3252-9092","authenticated-orcid":false,"given":"Jianhui","family":"Wu","sequence":"additional","affiliation":[{"name":"Nanjing Tech University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1805-8733","authenticated-orcid":false,"given":"Mingcheng","family":"Ni","sequence":"additional","affiliation":[{"name":"Nanjing Tech University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7158-913X","authenticated-orcid":false,"given":"Hui","family":"Liu","sequence":"additional","affiliation":[{"name":"Nanjing Tech University, Nanjing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Rasa: Relation and sensitivity aware representation learning for text-based person search. arXiv preprint arXiv:2305.13653","author":"Bai Yang","year":"2023","unstructured":"Yang Bai, Min Cao, Daming Gao, Ziqiang Cao, Chen Chen, Zhenfeng Fan, Liqiang Nie, and Min Zhang. 2023. Rasa: Relation and sensitivity aware representation learning for text-based person search. arXiv preprint arXiv:2305.13653 (2023)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_2_1","DOI":"10.1109\/ICCV48922.2021.00175"},{"unstructured":"Gary Bradski Adrian Kaehler et al. 2000. OpenCV. Dr. Dobb?s journal of software tools Vol. 3 2 (2000).","key":"e_1_3_2_1_3_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_4_1","DOI":"10.1609\/aaai.v38i1.27801"},{"key":"e_1_3_2_1_5_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. CoRR","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. CoRR, Vol. abs\/1810.04805 (2018). [arXiv]1810.04805 http:\/\/arxiv.org\/abs\/1810.04805"},{"unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020).","key":"e_1_3_2_1_6_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_7_1","DOI":"10.1007\/978-3-030-58548-8_13"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_8_1","DOI":"10.1109\/CVPR52688.2022.00495"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_9_1","DOI":"10.1109\/CVPR.2016.90"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_10_1","DOI":"10.1007\/978-3-642-21227-7_9"},{"key":"e_1_3_2_1_11_1","volume-title":"Long short-term memory. Neural computation","author":"Hochreiter Sepp","year":"1997","unstructured":"Sepp Hochreiter and J\u00fcrgen Schmidhuber. 1997. Long short-term memory. Neural computation, Vol. 9, 8 (1997), 1735--1780."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_12_1","DOI":"10.1109\/CVPR52729.2023.00273"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_13_1","DOI":"10.1109\/ICCV51070.2023.01826"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_14_1","DOI":"10.1109\/CVPR.2017.551"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_15_1","DOI":"10.3115\/1118108.1118117"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_16_1","DOI":"10.1016\/j.neucom.2022.07.028"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_17_1","DOI":"10.1145\/3503161.3547910"},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the International conference on machine learning. 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In Proceedings of the International conference on machine learning. 8748--8763."},{"key":"e_1_3_2_1_19_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_20_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_21_1","DOI":"10.24963\/ijcai.2021\/148"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_22_1","DOI":"10.1007\/978-3-319-10593-2_45"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_23_1","DOI":"10.1145\/3503161.3548166"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_24_1","DOI":"10.1117\/1.JEI.29.4.043028"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_25_1","DOI":"10.1109\/CVPR.2018.00543"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_26_1","DOI":"10.1109\/ICCV48922.2021.00165"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_27_1","DOI":"10.1007\/978-3-030-01267-0_19"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_28_1","DOI":"10.1109\/CVPR.2016.571"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_29_1","DOI":"10.1109\/TIP.2023.3327924"},{"key":"e_1_3_2_1_30_1","volume-title":"Classification is a strong baseline for deep metric learning. arXiv preprint arXiv:1811.12649","author":"Zhai Andrew","year":"2018","unstructured":"Andrew Zhai and Hao-Yu Wu. 2018. Classification is a strong baseline for deep metric learning. arXiv preprint arXiv:1811.12649 (2018)."},{"key":"e_1_3_2_1_31_1","volume-title":"Spatiotemporal transformer for video-based person re-identification. arXiv preprint arXiv:2103.16469","author":"Zhang Tianyu","year":"2021","unstructured":"Tianyu Zhang, Longhui Wei, Lingxi Xie, Zijie Zhuang, Yongfei Zhang, Bo Li, and Qi Tian. 2021. Spatiotemporal transformer for video-based person re-identification. arXiv preprint arXiv:2103.16469 (2021)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_32_1","DOI":"10.1145\/3474085.3475369"}],"event":{"sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"acronym":"MM '24","name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681715","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681715","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:28Z","timestamp":1750295848000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681715"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":32,"alternative-id":["10.1145\/3664647.3681715","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681715","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}