{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,9]],"date-time":"2025-10-09T00:19:09Z","timestamp":1759969149227,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":17,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,5,8]],"date-time":"2025-05-08T00:00:00Z","timestamp":1746662400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,5,8]]},"DOI":"10.1145\/3701716.3717655","type":"proceedings-article","created":{"date-parts":[[2025,5,23]],"date-time":"2025-05-23T16:20:01Z","timestamp":1748017201000},"page":"1586-1588","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Enhancing Text-Based Person Retrieval via Loss Balancing and Vision-Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2062-725X","authenticated-orcid":false,"given":"Jizheng","family":"Zhang","sequence":"first","affiliation":[{"name":"ZhCheng, College of Software Engineering, Jilin University, Changchun, Jilin, China"}]}],"member":"320","published-online":{"date-parts":[[2025,5,23]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Re-id done right: towards good practices for person re-identification. arXiv preprint arXiv:1801.05339","author":"Almazan Jon","year":"2018","unstructured":"Jon Almazan, Bojana Gajic, Naila Murray, and Diane Larlus. 2018. Re-id done right: towards good practices for person re-identification. arXiv preprint arXiv:1801.05339 (2018)."},{"key":"e_1_3_2_2_2_1","volume-title":"Multitask learning. Machine learning 28","author":"Caruana Rich","year":"1997","unstructured":"Rich Caruana. 1997. Multitask learning. Machine learning 28 (1997), 41--75."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.04.081"},{"key":"e_1_3_2_2_4_1","volume-title":"International conference on machine learning. PMLR, 794--803","author":"Chen Zhao","year":"2018","unstructured":"Zhao Chen, Vijay Badrinarayanan, Chen-Yu Lee, and Andrew Rabinovich. 2018. Gradnorm: Gradient normalization for adaptive loss balancing in deep multitask networks. In International conference on machine learning. PMLR, 794--803."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240550"},{"key":"e_1_3_2_2_6_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 7482--7491","author":"Kendall Alex","year":"2018","unstructured":"Alex Kendall, Yarin Gal, and Roberto Cipolla. 2018. Multi-task learning using uncertainty to weigh losses for scene geometry and semantics. In Proceedings of the IEEE conference on computer vision and pattern recognition. 7482--7491."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.551"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298832"},{"key":"e_1_3_2_2_9_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_2_10_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_2_11_1","volume-title":"MORE'25 Multimedia Object Re-ID: Advancements, Challenges, and Opportunities. In ACM Web Conference Workshop.","author":"Wang Yaxiong","year":"2025","unstructured":"Yaxiong Wang, Yunzhong Hou, Shuyu Yang, Zhedong Zheng, Zhun Zhong, and Liang Zheng. 2025. MORE'25 Multimedia Object Re-ID: Advancements, Challenges, and Opportunities. In ACM Web Conference Workshop."},{"key":"e_1_3_2_2_12_1","volume-title":"End-to-end deep learning for person search. arXiv preprint arXiv:1604.01850 2, 2","author":"Xiao Tong","year":"2016","unstructured":"Tong Xiao, Shuang Li, Bochao Wang, Liang Lin, and Xiaogang Wang. 2016. End-to-end deep learning for person search. arXiv preprint arXiv:1604.01850 2, 2 (2016), 4."},{"key":"e_1_3_2_2_13_1","volume-title":"Beyond Walking: A Large-Scale Image-Text Benchmark for Text-based Person Anomaly Search. arXiv preprint arXiv:2411.17776","author":"Yang Shuyu","year":"2024","unstructured":"Shuyu Yang, Yaxiong Wang, Li Zhu, and Zhedong Zheng. 2024. Beyond Walking: A Large-Scale Image-Text Benchmark for Text-based Person Anomaly Search. arXiv preprint arXiv:2411.17776 (2024)."},{"key":"e_1_3_2_2_14_1","volume-title":"Multi-Grained Vision Language Pre-Training: Aligning Texts with Visual Concepts. arXiv preprint arXiv:2111.08276","author":"Zeng Yan","year":"2021","unstructured":"Yan Zeng, Xinsong Zhang, and Hang Li. 2021. Multi-Grained Vision Language Pre-Training: Aligning Texts with Visual Concepts. arXiv preprint arXiv:2111.08276 (2021)."},{"key":"e_1_3_2_2_15_1","volume-title":"All-In-One Pre-trained Model For Vision-Language Tasks. arXiv preprint arXiv:2211.12402","author":"Zeng Yan","year":"2022","unstructured":"Yan Zeng, Xinsong Zhang, Hang Li, Jiawei Wang, Jipeng Zhang, and Wangchunshu Zhou. 2022. X?2-VLM: All-In-One Pre-trained Model For Vision-Language Tasks. arXiv preprint arXiv:2211.12402 (2022)."},{"key":"e_1_3_2_2_16_1","volume-title":"Cross-view language modeling: Towards unified cross-lingual cross-modal pre-training. arXiv preprint arXiv:2206.00621","author":"Zeng Yan","year":"2022","unstructured":"Yan Zeng, Wangchunshu Zhou, Ao Luo, and Xinsong Zhang. 2022. Cross-view language modeling: Towards unified cross-lingual cross-modal pre-training. arXiv preprint arXiv:2206.00621 (2022)."},{"key":"e_1_3_2_2_17_1","volume-title":"Chatgpt asks, blip-2 answers: Automatic questioning towards enriched visual descriptions. arXiv preprint arXiv:2303.06594","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Kilichbek Haydarov, Xiaoqian Shen, Wenxuan Zhang, and Mohamed Elhoseiny. 2023. Chatgpt asks, blip-2 answers: Automatic questioning towards enriched visual descriptions. arXiv preprint arXiv:2303.06594 (2023)."}],"event":{"name":"WWW '25: The ACM Web Conference 2025","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Sydney NSW Australia","acronym":"WWW '25"},"container-title":["Companion Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3701716.3717655","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3701716.3717655","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T03:08:26Z","timestamp":1759892906000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3701716.3717655"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,8]]},"references-count":17,"alternative-id":["10.1145\/3701716.3717655","10.1145\/3701716"],"URL":"https:\/\/doi.org\/10.1145\/3701716.3717655","relation":{},"subject":[],"published":{"date-parts":[[2025,5,8]]},"assertion":[{"value":"2025-05-23","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}