{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T11:30:29Z","timestamp":1764588629848,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,5,8]],"date-time":"2025-05-08T00:00:00Z","timestamp":1746662400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["Grants No. U23B2031, 62302142"],"award-info":[{"award-number":["Grants No. U23B2031, 62302142"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Anhui Provincial Natural Science Foundation","award":["Grants No. 2408085QF191, 2408085MF159"],"award-info":[{"award-number":["Grants No. 2408085QF191, 2408085MF159"]}]},{"DOI":"10.13039\/501100006374","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["Grants No. JZ2024HGTA0178, JZ2023HGQA0097, JZ2023HGQA0472"],"award-info":[{"award-number":["Grants No. JZ2024HGTA0178, JZ2023HGQA0097, JZ2023HGQA0472"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Major Project of Anhui Province","award":["Grant No. 202423k09020001"],"award-info":[{"award-number":["Grant No. 202423k09020001"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,5,8]]},"DOI":"10.1145\/3701716.3717656","type":"proceedings-article","created":{"date-parts":[[2025,5,23]],"date-time":"2025-05-23T16:20:01Z","timestamp":1748017201000},"page":"1568-1572","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Efficient Vision Language Model Fine-tuning for Text-based Person Anomaly Search"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-7373-0423","authenticated-orcid":false,"given":"Jiayi","family":"He","sequence":"first","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6313-2543","authenticated-orcid":false,"given":"Shengeng","family":"Tang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7718-5430","authenticated-orcid":false,"given":"Ao","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7546-9052","authenticated-orcid":false,"given":"Lechao","family":"Cheng","sequence":"additional","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3818-4277","authenticated-orcid":false,"given":"Jingjing","family":"Wu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8818-6740","authenticated-orcid":false,"given":"Yanyan","family":"Wei","sequence":"additional","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, China"}]}],"member":"320","published-online":{"date-parts":[[2025,5,23]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Fahad Shahbaz Khan, and Mubarak Shah.","author":"Acsintoae Andra","year":"2022","unstructured":"Andra Acsintoae, Andrei Florescu, Mariana-Iuliana Georgescu, Tudor Mare, Paul Sumedrea, Radu Tudor Ionescu, Fahad Shahbaz Khan, and Mubarak Shah. 2022. Ubnormal: New Benchmark for Supervised Open-Set Video Anomaly Detection. In Computer Vision and Pattern Recognition. 20143--20153."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612285"},{"key":"e_1_3_2_2_3_1","volume-title":"An Empirical Study of Clip for Text-based Person Search. In AAAI Conference on Artificial Intelligence. 465--473","author":"Cao Min","year":"2024","unstructured":"Min Cao, Yang Bai, Ziyin Zeng, Mang Ye, and Min Zhang. 2024. An Empirical Study of Clip for Text-based Person Search. In AAAI Conference on Artificial Intelligence. 465--473."},{"key":"e_1_3_2_2_4_1","volume-title":"Vlp: A Survey on Vision-Language Pre-training. Machine Intelligence Research","author":"Chen Fei-Long","year":"2023","unstructured":"Fei-Long Chen, Du-Zhen Zhang, Ming-Lun Han, Xiu-Yi Chen, Jing Shi, Shuang Xu, and Bo Xu. 2023. Vlp: A Survey on Vision-Language Pre-training. Machine Intelligence Research (2023), 38--56."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"crossref","unstructured":"Zi-Yi Dou Yichong Xu Zhe Gan Jianfeng Wang Shuohang Wang Lijuan Wang Chenguang Zhu Pengchuan Zhang Lu Yuan Nanyun Peng et al. 2022. An Empirical Study of Training End-to-End Vision-and-Language Transformers. In Computer Vision and Pattern Recognition. 18166--18176.","DOI":"10.1109\/CVPR52688.2022.01763"},{"key":"e_1_3_2_2_6_1","volume-title":"Convolutional Transformer Based Dual Discriminator Generative Adversarial Networks for Video Anomaly Detection. In ACM International Conference on Multimedia. 5546--5554","author":"Feng Xinyang","year":"2021","unstructured":"Xinyang Feng, Dongjin Song, Yuncong Chen, Zhengzhang Chen, Jingchao Ni, and Haifeng Chen. 2021. Convolutional Transformer Based Dual Discriminator Generative Adversarial Networks for Video Anomaly Detection. In ACM International Conference on Multimedia. 5546--5554."},{"key":"e_1_3_2_2_7_1","volume-title":"Multimodal Motion Conditioned Diffusion Model for Skeleton-Based Video Anomaly Detection. In International Conference on Computer Vision. 10318--10329","author":"Flaborea Alessandro","year":"2023","unstructured":"Alessandro Flaborea, Luca Collorone, Guido Maria D'Amely Di Melendugno, Stefano D'Arrigo, Bardh Prenkaj, and Fabio Galasso. 2023. Multimodal Motion Conditioned Diffusion Model for Skeleton-Based Video Anomaly Detection. In International Conference on Computer Vision. 10318--10329."},{"key":"e_1_3_2_2_8_1","volume-title":"Normalizing Flows for Human Pose Anomaly Detection. In International Conference on Computer Vision. 13545--13554","author":"Hirschorn Or","year":"2023","unstructured":"Or Hirschorn and Shai Avidan. 2023. Normalizing Flows for Human Pose Anomaly Detection. In International Conference on Computer Vision. 13545--13554."},{"key":"e_1_3_2_2_9_1","unstructured":"Weiquan Huang Aoqi Wu Yifan Yang Xufang Luo Yuqing Yang Liang Hu Qi Dai Xiyang Dai Dongdong Chen Chong Luo and Lili Qiu. 2024. LLM2CLIP: Powerful Language Model Unlock Richer Visual Representation. arXiv:2411.04997 [cs.CV] https:\/\/arxiv.org\/abs\/2411.04997"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"crossref","unstructured":"Huaizu Jiang Ishan Misra Marcus Rohrbach Erik Learned-Miller and Xinlei Chen. 2020. In Defense of Grid Features for Visual Question Answering. In Computer Vision and Pattern Recognition. 10267--10276.","DOI":"10.1109\/CVPR42600.2020.01028"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657823"},{"key":"e_1_3_2_2_12_1","volume-title":"Wortman Vaughan (Eds.)","volume":"34","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before Fuse: Vision and Language Representation Learning with Momentum Distillation. In Neural Information Processing Systems, M. Ranzato, A. Beygelzimer, Y. Dauphin, P.S. Liang, and J. Wortman Vaughan (Eds.), Vol. 34. 9694--9705."},{"key":"e_1_3_2_2_13_1","unstructured":"Junnan Li Ramprasaath Selvaraju Akhilesh Gotmare Shafiq Joty Caiming Xiong and Steven Chu Hong Hoi. 2021. Align before Fuse: Vision and Language Representation Learning with Momentum Distillation. In Neural Information Processing Systems. 9694--9705."},{"key":"e_1_3_2_2_14_1","volume-title":"Hero: Hierarchical Encoder for Video Language Omni-Representation Pre-training. arXiv preprint arXiv:2005.00200","author":"Li Linjie","year":"2020","unstructured":"Linjie Li, Yen-Chun Chen, Yu Cheng, Zhe Gan, Licheng Yu, and Jingjing Liu. 2020. Hero: Hierarchical Encoder for Video Language Omni-Representation Pre-training. arXiv preprint arXiv:2005.00200 (2020)."},{"key":"e_1_3_2_2_15_1","unstructured":"Shuang Li Tong Xiao Hongsheng Li Bolei Zhou Dayu Yue and XiaogangWang. 2017. Person Search with Natural Language Description. In Computer Vision and Pattern Recognition. 1970--1979."},{"key":"e_1_3_2_2_16_1","volume-title":"Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks. In European Conference on Computer Vision. 121--137","author":"Li Xiujun","year":"2020","unstructured":"Xiujun Li, Xi Yin, Chunyuan Li, Pengchuan Zhang, Xiaowei Hu, Lei Zhang, Lijuan Wang, Houdong Hu, Li Dong, Furu Wei, Yejin Choi, and Jianfeng Gao. 2020. Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks. In European Conference on Computer Vision. 121--137."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2023.103623"},{"key":"e_1_3_2_2_18_1","volume-title":"Modality Alignment Meets Federated Broadcasting. arXiv preprint arXiv:2411.15837","author":"Ma Yuting","year":"2024","unstructured":"Yuting Ma, Shengeng Tang, Xiaohua Xu, and Lechao Cheng. 2024. Modality Alignment Meets Federated Broadcasting. arXiv preprint arXiv:2411.15837 (2024)."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.2984883"},{"key":"e_1_3_2_2_20_1","volume-title":"PLOT: Textbased Person Search with Part Slot Attention for Corresponding Part Discovery. In European Conference on Computer Vision. 474--490","author":"Park Jicheol","year":"2025","unstructured":"Jicheol Park, Dongwon Kim, Boseung Jeong, and Suha Kwak. 2025. PLOT: Textbased Person Search with Part Slot Attention for Corresponding Part Discovery. In European Conference on Computer Vision. 474--490."},{"key":"e_1_3_2_2_21_1","volume-title":"Pedestrian-specific Bipartite-aware Similarity Learning for Text-based Person Retrieval. In ACM International Conference on Multimedia. 8922--8931","author":"Shen Fei","year":"2023","unstructured":"Fei Shen, Xiangbo Shu, Xiaoyu Du, and Jinhui Tang. 2023. Pedestrian-specific Bipartite-aware Similarity Learning for Text-based Person Retrieval. In ACM International Conference on Multimedia. 8922--8931."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3359045"},{"key":"e_1_3_2_2_23_1","volume-title":"Eva-Clip: Improved Training Techniques for Clip at Scale. arXiv preprint arXiv:2303.15389","author":"Sun Quan","year":"2023","unstructured":"Quan Sun, Yuxin Fang, Ledell Wu, Xinlong Wang, and Yue Cao. 2023. Eva-Clip: Improved Training Techniques for Clip at Scale. arXiv preprint arXiv:2303.15389 (2023)."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3117124"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i7.32781"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547830"},{"key":"e_1_3_2_2_27_1","volume-title":"Gloss-driven Conditional Diffusion Models for Sign Language Production. ACM Transactions on Multimedia Computing, Communications and Applications","author":"Tang Shengeng","year":"2024","unstructured":"Shengeng Tang, Feng Xue, Jingjing Wu, Shuo Wang, and Richang Hong. 2024. Gloss-driven Conditional Diffusion Models for Sign Language Production. ACM Transactions on Multimedia Computing, Communications and Applications (2024)."},{"key":"e_1_3_2_2_28_1","volume-title":"MORE'25 Multimedia Object Re-ID: Advancements, Challenges, and Opportunities. In ACM Web Conference Workshop.","author":"Wang Yaxiong","year":"2025","unstructured":"Yaxiong Wang, Yunzhong Hou, Shuyu Yang, Zhedong Zheng, Zhun Zhong, and Liang Zheng. 2025. MORE'25 Multimedia Object Re-ID: Advancements, Challenges, and Opportunities. In ACM Web Conference Workshop."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3024822"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/526"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2024.104222"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3682066"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3643490.3661805"},{"key":"e_1_3_2_2_34_1","volume-title":"Beyond Walking: A Large-Scale Image-Text Benchmark for Text-based Person Anomaly Search. arXiv preprint arXiv:2411.17776","author":"Yang Shuyu","year":"2024","unstructured":"Shuyu Yang, Yaxiong Wang, Li Zhu, and Zhedong Zheng. 2024. Beyond Walking: A Large-Scale Image-Text Benchmark for Text-based Person Anomaly Search. arXiv preprint arXiv:2411.17776 (2024)."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"crossref","unstructured":"M Zaigham Zaheer Arif Mahmood M Haris Khan Mattia Segu Fisher Yu and Seung-Ik Lee. 2022. Generative Cooperative Learning for Unsupervised Video Anomaly Detection. In Computer Vision and Pattern Recognition. 14744--14754.","DOI":"10.1109\/CVPR52688.2022.01433"},{"key":"e_1_3_2_2_36_1","unstructured":"Yan Zeng Xinsong Zhang and Hang Li. 2022. Multi-Grained Vision Language Pre-Training: Aligning Texts with Visual Concepts. arXiv:2111.08276 [cs.CL]"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413518"},{"key":"e_1_3_2_2_38_1","volume-title":"Deep Cross-Modal Projection Learning for Image-Text Matching. In European Conference on Computer Vision. 686--701","author":"Zhang Ying","year":"2018","unstructured":"Ying Zhang and Huchuan Lu. 2018. Deep Cross-Modal Projection Learning for Image-Text Matching. In European Conference on Computer Vision. 686--701."},{"key":"e_1_3_2_2_39_1","volume-title":"Unified Vision-Language Pre-training for Image Captioning and Vqa. In AAAI Conference on Artificial Intelligence. 13041--13049","author":"Zhou Luowei","year":"2020","unstructured":"Luowei Zhou, Hamid Palangi, Lei Zhang, Houdong Hu, Jason Corso, and Jianfeng Gao. 2020. Unified Vision-Language Pre-training for Image Captioning and Vqa. In AAAI Conference on Artificial Intelligence. 13041--13049."}],"event":{"name":"WWW '25: The ACM Web Conference 2025","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Sydney NSW Australia","acronym":"WWW '25"},"container-title":["Companion Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3701716.3717656","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3701716.3717656","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T03:08:50Z","timestamp":1759892930000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3701716.3717656"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,8]]},"references-count":39,"alternative-id":["10.1145\/3701716.3717656","10.1145\/3701716"],"URL":"https:\/\/doi.org\/10.1145\/3701716.3717656","relation":{},"subject":[],"published":{"date-parts":[[2025,5,8]]},"assertion":[{"value":"2025-05-23","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}