{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T08:23:05Z","timestamp":1742977385992,"version":"3.40.3"},"publisher-location":"Cham","reference-count":33,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031784552"},{"type":"electronic","value":"9783031784569"}],"license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78456-9_22","type":"book-chapter","created":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T11:25:21Z","timestamp":1733138721000},"page":"341-357","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Distill the Knowledge of Multimodal Large Language Model into Text-to-Image Vehicle Re-identification"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-5326-4442","authenticated-orcid":false,"given":"Jianshu","family":"Zeng","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2524-0193","authenticated-orcid":false,"given":"Chi","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,3]]},"reference":[{"key":"22_CR1","unstructured":"Achiam, J., Adler, S., Agarwal, S., Ahmad, L., Akkaya, I., Aleman, F.L., Almeida, D., Altenschmidt, J., Altman, S., Anadkat, S., et\u00a0al.: Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"22_CR2","unstructured":"Bai, J., Bai, S., Yang, S., Wang, S., Tan, S., Wang, P., Lin, J., Zhou, C., Zhou, J.: Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)"},{"key":"22_CR3","doi-asserted-by":"crossref","unstructured":"Bin, Y., Li, H., Xu, Y., Xu, X., Yang, Y., Shen, H.T.: Unifying two-stream encoders with transformers for cross-modal retrieval. In: Proceedings of the 31st ACM International Conference on Multimedia. pp. 3041\u20133050 (2023)","DOI":"10.1145\/3581783.3612427"},{"key":"22_CR4","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J.D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"22_CR5","doi-asserted-by":"crossref","unstructured":"Chen, L., Li, J., Dong, X., Zhang, P., He, C., Wang, J., Zhao, F., Lin, D.: Sharegpt4v: Improving large multi-modal models with better captions. arXiv preprint arXiv:2311.12793 (2023)","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"22_CR6","doi-asserted-by":"publisher","first-page":"171","DOI":"10.1016\/j.neucom.2022.04.081","volume":"494","author":"Y Chen","year":"2022","unstructured":"Chen, Y., Zhang, G., Lu, Y., Wang, Z., Zheng, Y.: Tipcb: A simple but effective part-based convolutional baseline for text-based person search. Neurocomputing 494, 171\u2013181 (2022)","journal-title":"Neurocomputing"},{"key":"22_CR7","doi-asserted-by":"crossref","unstructured":"Chen, Z., Wu, J., Wang, W., Su, W., Chen, G., Xing, S., Zhong, M., Zhang, Q., Zhu, X., Lu, L., et\u00a0al.: Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 24185\u201324198 (2024)","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"22_CR8","doi-asserted-by":"crossref","unstructured":"Cui, C., Ma, Y., Cao, X., Ye, W., Zhou, Y., Liang, K., Chen, J., Lu, J., Yang, Z., Liao, K.D., et\u00a0al.: A survey on multimodal large language models for autonomous driving. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. pp. 958\u2013979 (2024)","DOI":"10.1109\/WACVW60836.2024.00106"},{"key":"22_CR9","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"22_CR10","doi-asserted-by":"crossref","unstructured":"Ding, L., Liu, L., Huang, Y., Li, C., Zhang, C., Wang, W., Wang, L.: Text-to-image vehicle re-identification: Multi-scale multi-view cross-modal alignment network and a unified benchmark. IEEE Transactions on Intelligent Transportation Systems (2024)","DOI":"10.1109\/TITS.2023.3348599"},{"key":"22_CR11","unstructured":"Ding, Z., Ding, C., Shao, Z., Tao, D.: Semantically self-aligned network for text-to-image part-aware person re-identification. arXiv preprint arXiv:2107.12666 (2021)"},{"key":"22_CR12","unstructured":"Fu, C., Chen, P., Shen, Y., Qin, Y., Zhang, M., Lin, X., Yang, J., Zheng, X., Li, K., Sun, X., Wu, Y., Ji, R.: Mme: A comprehensive evaluation benchmark for multimodal large language models. arXiv preprint arXiv:2306.13394 (2023)"},{"key":"22_CR13","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"22_CR14","doi-asserted-by":"crossref","unstructured":"He, S., Luo, H., Wang, P., Wang, F., Li, H., Jiang, W.: Transreid: Transformer-based object re-identification. In: Proceedings of the IEEE\/CVF international conference on computer vision. pp. 15013\u201315022 (2021)","DOI":"10.1109\/ICCV48922.2021.01474"},{"key":"22_CR15","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh, M., Young, P., Hockenmaier, J.: Framing image description as a ranking task: Data, models and evaluation metrics. Journal of Artificial Intelligence Research 47, 853\u2013899 (2013)","journal-title":"Journal of Artificial Intelligence Research"},{"key":"22_CR16","doi-asserted-by":"crossref","unstructured":"Jia, S., Lyu, R., Zhao, K., Chen, Y., Yan, Z., Ju, Y., Hu, C., Li, X., Wu, B., Lyu, S.: Can chatgpt detect deepfakes? a study of using multimodal large language models for media forensics. arXiv preprint arXiv:2403.14077 (2024)","DOI":"10.1109\/CVPRW63382.2024.00436"},{"key":"22_CR17","doi-asserted-by":"publisher","first-page":"2238","DOI":"10.1109\/LSP.2022.3217682","volume":"29","author":"S Li","year":"2022","unstructured":"Li, S., Lu, A., Huang, Y., Li, C., Wang, L.: Joint token and feature alignment framework for text-based person search. IEEE Signal Process. Lett. 29, 2238\u20132242 (2022)","journal-title":"IEEE Signal Process. Lett."},{"key":"22_CR18","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"22_CR19","doi-asserted-by":"crossref","unstructured":"Liu, H., Tian, Y., Yang, Y., Pang, L., Huang, T.: Deep relative distance learning: Tell the difference between similar vehicles. In: Proceedings of Conference on Computer Vision and Pattern Recognition. pp. 2167\u20132175 (2016)","DOI":"10.1109\/CVPR.2016.238"},{"issue":"3","key":"22_CR20","doi-asserted-by":"publisher","first-page":"645","DOI":"10.1109\/TMM.2017.2751966","volume":"20","author":"X Liu","year":"2018","unstructured":"Liu, X., Liu, W., Mei, T., Ma, H.: Provid: Progressive and multimodal vehicle reidentification for large-scale urban surveillance. IEEE Trans. Multimedia 20(3), 645\u2013658 (2018). https:\/\/doi.org\/10.1109\/TMM.2017.2751966","journal-title":"IEEE Trans. Multimedia"},{"key":"22_CR21","doi-asserted-by":"crossref","unstructured":"Lou, Y., Bai, Y., Liu, J., Wang, S., Duan, L.: Veri-wild: A large dataset and a new method for vehicle re-identification in the wild. In: Proceedings of Conference on Computer Vision and Pattern Recognition. pp. 3235\u20133243 (2019)","DOI":"10.1109\/CVPR.2019.00335"},{"key":"22_CR22","doi-asserted-by":"crossref","unstructured":"Oh\u00a0Song, H., Xiang, Y., Jegelka, S., Savarese, S.: Deep metric learning via lifted structured feature embedding. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 4004\u20134012 (2016)","DOI":"10.1109\/CVPR.2016.434"},{"key":"22_CR23","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"22_CR24","unstructured":"OpenAI: Gpt-4v(ision) system card (2023), https:\/\/openai.com\/research\/gpt-4v-system-card"},{"key":"22_CR25","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et\u00a0al.: Improving language understanding by generative pre-training (2018), https:\/\/www.mikecaptain.com\/resources\/pdf\/GPT-1.pdf"},{"key":"22_CR26","doi-asserted-by":"crossref","unstructured":"Shao, Z., Zhang, X., Fang, M., Lin, Z., Wang, J., Ding, C.: Learning granularity-unified representations for text-to-image person re-identification. In: Proceedings of the 30th acm international conference on multimedia. pp. 5566\u20135574 (2022)","DOI":"10.1145\/3503161.3548028"},{"key":"22_CR27","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. Advances in neural information processing systems 30 (2017)"},{"key":"22_CR28","unstructured":"Wang, W., Mrini, K., Yang, L., Kumar, S., Tian, Y., Yan, X., Wang, H.: Finetuned multimodal language models are high-quality image-text data filters. arXiv preprint arXiv:2403.02677 (2024)"},{"key":"22_CR29","unstructured":"Wen, L., Yang, X., Fu, D., Wang, X., Cai, P., Li, X., Ma, T., Li, Y., Xu, L., Shang, D., et\u00a0al.: On the road with gpt-4v (ision): Early explorations of visual-language model on autonomous driving. arXiv preprint arXiv:2311.05332 (2023)"},{"key":"22_CR30","doi-asserted-by":"crossref","unstructured":"Yin, S., Fu, C., Zhao, S., Li, K., Sun, X., Xu, T., Chen, E.: A survey on multimodal large language models. arXiv preprint arXiv:2306.13549 (2024)","DOI":"10.1093\/nsr\/nwae403"},{"key":"22_CR31","unstructured":"Zheng, T., Milind, N., Ming-Yu, L., Xiaodong, Y., Stan, B., Shuo, W., Ratnesh, K., David, A., Jenq-Neng, H.: Cityflow: A city-scale benchmark for multi-target multi-camera vehicle tracking and re-identification. In: CVPR (2019)"},{"key":"22_CR32","doi-asserted-by":"crossref","unstructured":"Zheng, Z., Jiang, M., Wang, Z., Wang, J., Bai, Z., Zhang, X., Yu, X., Tan, X., Yang, Y., Wen, S., et\u00a0al.: Going beyond real data: A robust visual representation for vehicle re-identification. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 598\u2013599 (2020)","DOI":"10.1109\/CVPRW50498.2020.00307"},{"key":"22_CR33","doi-asserted-by":"publisher","first-page":"2683","DOI":"10.1109\/TMM.2020.3014488","volume":"23","author":"Z Zheng","year":"2020","unstructured":"Zheng, Z., Ruan, T., Wei, Y., Yang, Y., Mei, T.: Vehiclenet: Learning robust visual representation for vehicle re-identification. IEEE Trans. Multimedia 23, 2683\u20132693 (2020)","journal-title":"IEEE Trans. Multimedia"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78456-9_22","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T12:13:30Z","timestamp":1733141610000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78456-9_22"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"ISBN":["9783031784552","9783031784569"],"references-count":33,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78456-9_22","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"3 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kolkata","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icpr2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icpr2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}