{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,19]],"date-time":"2026-02-19T08:03:29Z","timestamp":1771488209661,"version":"3.50.1"},"reference-count":48,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2024,11,13]],"date-time":"2024-11-13T00:00:00Z","timestamp":1731456000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2024,11,13]],"date-time":"2024-11-13T00:00:00Z","timestamp":1731456000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"DOI":"10.13039\/100014718","name":"Innovative Research Group Project of the National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62072465"],"award-info":[{"award-number":["62072465"]}],"id":[{"id":"10.13039\/100014718","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Complex Intell. Syst."],"published-print":{"date-parts":[[2025,1]]},"DOI":"10.1007\/s40747-024-01614-w","type":"journal-article","created":{"date-parts":[[2024,11,13]],"date-time":"2024-11-13T09:02:20Z","timestamp":1731488540000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Bridging the gap: multi-granularity representation learning for text-based vehicle retrieval"],"prefix":"10.1007","volume":"11","author":[{"given":"Xue","family":"Bo","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Junjie","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Di","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wentao","family":"Ma","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,13]]},"reference":[{"key":"1614_CR1","doi-asserted-by":"crossref","unstructured":"He S, Luo H, Chen W, Zhang M, Zhang Y, Wang F, Li H, Jiang W (2020) Multi-domain learning and identity mining for vehicle re-identification. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops, pp 582\u2013583","DOI":"10.1109\/CVPRW50498.2020.00299"},{"key":"1614_CR2","doi-asserted-by":"crossref","unstructured":"Zhu X, Luo Z, Fu P, Ji X (2020) Voc-reid: Vehicle re-identification based on vehicle-orientation-camera. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops, pp 602\u2013603","DOI":"10.1109\/CVPRW50498.2020.00309"},{"key":"1614_CR3","doi-asserted-by":"crossref","unstructured":"Meng D, Li L, Liu X, Li Y, Yang S, Zha Z-J, Gao X, Wang S, Huang Q (2020) Parsing-based view-aware embedding network for vehicle re-identification. In: Proceedings of the IEEE Conference on computer vision and pattern recognition, pp 7103\u20137112","DOI":"10.1109\/CVPR42600.2020.00713"},{"issue":"3","key":"1614_CR4","doi-asserted-by":"publisher","first-page":"45","DOI":"10.1007\/s10462-023-10678-y","volume":"57","author":"M Pavlov-Kagadejev","year":"2024","unstructured":"Pavlov-Kagadejev M, Jovanovic L, Bacanin N, Deveci M, Zivkovic M, Tuba M, Strumberger I, Pedrycz W (2024) Optimizing long-short-term memory models via metaheuristics for decomposition aided wind energy generation forecasting. Artif Intell Rev 57(3):45","journal-title":"Artif Intell Rev"},{"key":"1614_CR5","doi-asserted-by":"crossref","unstructured":"Chen T-S, Liu C-T, Wu C-W, Chien S-Y (2020) Orientation-aware vehicle re-identification with semantics-guided part attention network. In: Proceedings of the European Conference on computer vision, pp 330\u2013346. Springer","DOI":"10.1007\/978-3-030-58536-5_20"},{"key":"1614_CR6","doi-asserted-by":"crossref","unstructured":"Khorramshahi P, Peri N, Chen J-c, Chellappa R (2020) The devil is in the details: self-supervised attention for vehicle re-identification. In: Proceedings of the European Conference on computer vision, pp 369\u2013386. Springer","DOI":"10.1007\/978-3-030-58568-6_22"},{"key":"1614_CR7","doi-asserted-by":"publisher","first-page":"1039","DOI":"10.1109\/TIP.2023.3238642","volume":"32","author":"F Shen","year":"2023","unstructured":"Shen F, Xie Y, Zhu J, Zhu X, Zeng H (2023) Git: graph interactive transformer for vehicle re-identification. IEEE Trans Image Process 32:1039\u20131051","journal-title":"IEEE Trans Image Process"},{"issue":"2","key":"1614_CR8","doi-asserted-by":"publisher","first-page":"519","DOI":"10.1007\/s11280-022-01060-z","volume":"26","author":"J Li","year":"2023","unstructured":"Li J, Cong Y, Zhou L, Tian Z, Qiu J (2023) Super-resolution-based part collaboration network for vehicle re-identification. World Wide Web 26(2):519\u2013538","journal-title":"World Wide Web"},{"key":"1614_CR9","doi-asserted-by":"publisher","first-page":"26719","DOI":"10.1109\/ACCESS.2024.3367588","volume":"12","author":"J Cincovic","year":"2024","unstructured":"Cincovic J, Jovanovic L, Nikolic B, Bacanin N (2024) Neurodegenerative condition detection using modified metaheuristic for attention based recurrent neural networks and extreme gradient boosting tuning. IEEE Access 12:26719\u201326734","journal-title":"IEEE Access"},{"key":"1614_CR10","unstructured":"Feng Q, Ablavsky V, Sclaroff S (2021) Cityflow-nl: tracking and retrieval of vehicles at city scale by natural language descriptions. arXiv preprint arXiv:2101.04741"},{"key":"1614_CR11","doi-asserted-by":"crossref","unstructured":"Scribano C, Sapienza D, Franchini G, Verucchi M, Bertogna M (2021) All you can embed: Natural language based vehicle retrieval with spatio-temporal transformers. In: Proceedings of the IEEE Conference on computer vision and pattern recognition, pp 4253\u20134262","DOI":"10.1109\/CVPRW53098.2021.00481"},{"key":"1614_CR12","unstructured":"Turc I, Chang M-W, Lee K, Toutanova K (2019) Well-read students learn better: On the importance of pre-training compact models. arXiv preprint arXiv:1908.08962"},{"key":"1614_CR13","doi-asserted-by":"crossref","unstructured":"Bai S, Zheng Z, Wang X, Lin J, Zhang Z, Zhou C, Yang H, Yang Y (2021) Connecting language and vision for natural language-based vehicle retrieval. In: Proceedings of the IEEE Conference on computer vision and pattern recognition, pp 4034\u20134043","DOI":"10.1109\/CVPRW53098.2021.00455"},{"key":"1614_CR14","doi-asserted-by":"crossref","unstructured":"Ngo BH, Nguyen DT, Do-Tran N-T, Thien PPH, An M-H, Nguyen T-N, Hoang LN, Nguyen VD, Dinh V (2023) Comprehensive visual features and pseudo labeling for robust natural language-based vehicle retrieval. In: Proceedings of the IEEE Conference on computer vision and pattern recognition, pp 5408\u20135417","DOI":"10.1109\/CVPRW59228.2023.00571"},{"key":"1614_CR15","doi-asserted-by":"crossref","unstructured":"Xie D, Liu L, Zhang S, Tian J (2023) A unified multi-modal structure for retrieving tracked vehicles through natural language descriptions. In: Proceedings of the IEEE Conference on computer vision and pattern recognition, pp 5418\u20135426","DOI":"10.1109\/CVPRW59228.2023.00572"},{"key":"1614_CR16","doi-asserted-by":"crossref","unstructured":"Zhang J, Lin X, Jiang M, Yu Y, Gong C, Zhang W, Tan, X, Li Y, Ding E, Li G (2022) A multi-granularity retrieval system for natural language-based vehicle retrieval. In: Proceedings of the IEEE Conference on computer vision and pattern recognition, pp 3216\u20133225","DOI":"10.1109\/CVPRW56347.2022.00363"},{"key":"1614_CR17","doi-asserted-by":"crossref","unstructured":"Zhao C, Chen H, Zhang W, Chen J, Zhang S, Li Y, Li B (2022) Symmetric network with spatial relationship modeling for natural language-based vehicle retrieval. In: Proceedings of the IEEE Conference on computer vision and pattern recognition, pp. 3226\u20133233","DOI":"10.1109\/CVPRW56347.2022.00364"},{"key":"1614_CR18","doi-asserted-by":"crossref","unstructured":"Du Y, Zhang B, Ruan X, Su F, Zhao Z, Chen H (2022) Omg: Observe multiple granularities for natural language-based vehicle retrieval. In: Proceedings of the IEEE Conference on computer vision and pattern recognition, pp 3124\u20133133","DOI":"10.1109\/CVPRW56347.2022.00352"},{"key":"1614_CR19","doi-asserted-by":"crossref","unstructured":"Sun Z, Liu X, Bi X, Nie X, Yin Y (2021) Dun: Dual-path temporal matching network for natural language-based vehicle retrieval. In: Proceedings of the IEEE Conference on computer vision and pattern recognition, pp 4061\u20134067","DOI":"10.1109\/CVPRW53098.2021.00458"},{"key":"1614_CR20","doi-asserted-by":"crossref","unstructured":"Khorramshahi P, Rambhatla SS, Chellappa R (2021) Towards accurate visual and natural language-based vehicle retrieval systems. In: Proceedings of the IEEE Conference on computer vision and pattern recognition, pp 4183\u20134192","DOI":"10.1109\/CVPRW53098.2021.00472"},{"key":"1614_CR21","doi-asserted-by":"crossref","unstructured":"Nguyen T-P, Tran-Le B-T, Thai X-D, Nguyen TV, Do MN, Tran M-T (2021) Traffic video event retrieval via text query using vehicle appearance and motion attributes. In: Proceedings of the IEEE Conference on computer vision and pattern recognition, pp 4165\u20134172","DOI":"10.1109\/CVPRW53098.2021.00470"},{"key":"1614_CR22","first-page":"1","volume":"99","author":"W Ma","year":"2022","unstructured":"Ma W, Chen Q, Liu F, Zhou T, Cai Z (2022) Query-adaptive late fusion for hierarchical fine-grained video-text retrieval. IEEE Trans Neural Netw Learn Syst 99:1\u201312","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"key":"1614_CR23","doi-asserted-by":"crossref","unstructured":"Ma W, Chen Q, Zhou T, Zhao S, Cai Z (2023) Using multimodal contrastive knowledge distillation for video-text retrieval. IEEE Trans Circuits Syst Video Technol 33(10):5486\u20135497","DOI":"10.1109\/TCSVT.2023.3257193"},{"key":"1614_CR24","doi-asserted-by":"crossref","unstructured":"Jin W, Zhao Z, Zhang P, Zhu J, He X, Zhuang Y (2021) Hierarchical cross-modal graph consistency learning for video-text retrieval. In: Proceedings of the 44th International ACM SIGIR Conference on research and development in information retrieval, pp 1114\u20131124","DOI":"10.1145\/3404835.3462974"},{"key":"1614_CR25","doi-asserted-by":"crossref","unstructured":"Chen S, Zhao Y, Jin Q, Wu Q (2020) Fine-grained video-text retrieval with hierarchical graph reasoning. In: Proceedings of the IEEE Conference on computer vision and pattern recognition, pp 10638\u201310647","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"1614_CR26","unstructured":"Oord Avd, Li Y, Vinyals O (2018) Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748"},{"key":"1614_CR27","unstructured":"Huo Y, Zhang M, Liu G, Lu H, Gao Y, Yang G, Wen J, Zhang H, Xu B, Zheng W, et al (2021) Wenlan: Bridging vision and language by large-scale multi-modal pre-training. arXiv preprint arXiv:2103.06561"},{"key":"1614_CR28","unstructured":"Zimmermann RS, Sharma Y, Schneider S, Bethge M, Brendel W (2021) Contrastive learning inverts the data generating process. In: Proceedings of the International Conference on machine learning, pp. 12979\u201312990. PMLR"},{"key":"1614_CR29","doi-asserted-by":"crossref","unstructured":"Ma Y, Xu G, Sun X, Yan M, Zhang J, Ji R (2022) X-clip: end-to-end multi-grained contrastive learning for video-text retrieval. In: Proceedings of the 30th ACM International Conference on multimedia, pp 638\u2013647","DOI":"10.1145\/3503161.3547910"},{"key":"1614_CR30","doi-asserted-by":"crossref","unstructured":"Zhao S, Xu L, Liu Y, Du S (2023) Multi-grained representation learning for cross-modal retrieval. In: Proceedings of the International ACM SIGIR Conference on research and development in information retrieval, pp 2194\u20132198","DOI":"10.1145\/3539618.3592025"},{"key":"1614_CR31","unstructured":"Huang Z, Xu W, Yu K (2015) Bidirectional lstm-crf models for sequence tagging. arXiv preprint arXiv:1508.01991"},{"issue":"10","key":"1614_CR32","doi-asserted-by":"publisher","first-page":"2451","DOI":"10.1162\/089976600300015015","volume":"12","author":"FA Gers","year":"2000","unstructured":"Gers FA, Schmidhuber J, Cummins F (2000) Learning to forget: continual prediction with lstm. Neural Comput 12(10):2451\u20132471","journal-title":"Neural Comput"},{"key":"1614_CR33","unstructured":"Devlin J, Chang M-W, Lee K, Toutanova K (2018) Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805"},{"key":"1614_CR34","doi-asserted-by":"crossref","unstructured":"He K, Chen X, Xie S, Li Y, Doll\u00e1r P, Girshick R (2022) Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE Conference on computer vision and pattern recognition, pp. 16000\u201316009","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"1614_CR35","doi-asserted-by":"crossref","unstructured":"Fan Y, Lu X, Li D, Liu Y (2016) Video-based emotion recognition using cnn-rnn and c3d hybrid networks. In: Proceedings of the ACM International Conference on multimodal interaction, pp. 445\u2013450","DOI":"10.1145\/2993148.2997632"},{"key":"1614_CR36","doi-asserted-by":"crossref","unstructured":"Xu H, Das A, Saenko K (2017) R-c3d: Region convolutional 3d network for temporal activity detection. In: Proceedings of the IEEE International Conference on computer vision, pp 5783\u20135792","DOI":"10.1109\/ICCV.2017.617"},{"key":"1614_CR37","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.109258","volume":"137","author":"W Zhu","year":"2023","unstructured":"Zhu W, Wang Z, Wang X, Hu R, Liu H, Liu C, Wang C, Li D (2023) A dual self-attention mechanism for vehicle re-identification. Pattern Recogn 137:109258","journal-title":"Pattern Recogn"},{"key":"1614_CR38","doi-asserted-by":"crossref","unstructured":"Khorramshahi P, Kumar A, Peri N, Rambhatla SS, Chen J-C, Chellappa R (2019) A dual-path model with adaptive attention for vehicle re-identification. In: Proceedings of the IEEE International Conference on computer vision, pp. 6132\u20136141","DOI":"10.1109\/ICCV.2019.00623"},{"key":"1614_CR39","doi-asserted-by":"crossref","unstructured":"Chu R, Sun Y, Li Y, Liu Z, Zhang C, Wei Y (2019) Vehicle re-identification with viewpoint-aware metric learning. In: Proceedings of the IEEE International Conference on computer vision, pp 8282\u20138291","DOI":"10.1109\/ICCV.2019.00837"},{"key":"1614_CR40","doi-asserted-by":"crossref","unstructured":"Kalchbrenner N, Grefenstette E, Blunsom P (2014) A convolutional neural network for modelling sentences. arXiv preprint arXiv:1404.2188","DOI":"10.3115\/v1\/P14-1062"},{"key":"1614_CR41","unstructured":"Yao L, Huang R, Hou L, Lu G, Niu M, Xu H, Liang X, Li Z, Jiang X, Xu C (2021) Filip: fine-grained interactive language-image pre-training. arXiv preprint arXiv:2111.07783"},{"key":"1614_CR42","doi-asserted-by":"crossref","unstructured":"Mei X, Liu X, Sun J, Plumbley MD, Wang W (2022) On metric learning for audio-text cross-modal retrieval. arXiv preprint arXiv:2203.15537","DOI":"10.21437\/Interspeech.2022-11115"},{"key":"1614_CR43","doi-asserted-by":"crossref","unstructured":"Tang Z, Naphade M, Liu M-Y, Yang X, Birchfield S, Wang S, Kumar R, Anastasiu D, Hwang J-N (2019) Cityflow: a city-scale benchmark for multi-target multi-camera vehicle tracking and re-identification. In: Proceedings of the IEEE Conference on computer vision and pattern recognition, pp 8797\u20138806","DOI":"10.1109\/CVPR.2019.00900"},{"key":"1614_CR44","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"1614_CR45","doi-asserted-by":"crossref","unstructured":"Park E-J, Kim H, Jeong S, Kang B, Kwon Y (2021) Keyword-based vehicle retrieval. In: Proceedings of the IEEE Conference on computer vision and pattern recognition, pp 4220\u20134227","DOI":"10.1109\/CVPRW53098.2021.00477"},{"key":"1614_CR46","doi-asserted-by":"crossref","unstructured":"Nguyen TM, Pham QH, Doan LB, Trinh HV, Nguyen V-A, Phan V-H (2021) Contrastive learning for natural language-based vehicle retrieval. In: Proceedings of the IEEE Conference on computer vision and pattern recognition, pp 4245\u20134252","DOI":"10.1109\/CVPRW53098.2021.00480"},{"key":"1614_CR47","doi-asserted-by":"crossref","unstructured":"Sebastian C, Imbriaco R, Meletis P, Dubbelman G, Bondarev E (2021) Tied: a cycle consistent encoder-decoder model for text-to-image retrieval. In: Proceedings of the IEEE Conference on computer vision and pattern recognition, pp 4138\u20134146","DOI":"10.1109\/CVPRW53098.2021.00467"},{"key":"1614_CR48","doi-asserted-by":"crossref","unstructured":"Lee S, Woo T, Lee SH (2021) Sbnet: Segmentation-based network for natural language-based vehicle search. In: Proceedings of the IEEE Conference on computer vision and pattern recognition, pp 4054\u20134060","DOI":"10.1109\/CVPRW53098.2021.00457"}],"container-title":["Complex &amp; Intelligent Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s40747-024-01614-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s40747-024-01614-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s40747-024-01614-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,30]],"date-time":"2025-01-30T20:16:23Z","timestamp":1738268183000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s40747-024-01614-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,13]]},"references-count":48,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2025,1]]}},"alternative-id":["1614"],"URL":"https:\/\/doi.org\/10.1007\/s40747-024-01614-w","relation":{},"ISSN":["2199-4536","2198-6053"],"issn-type":[{"value":"2199-4536","type":"print"},{"value":"2198-6053","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,13]]},"assertion":[{"value":"19 March 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 September 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 November 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors of this paper assert that they have no known financial interests or personal relationships that could have potentially biased the work presented herein.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"32"}}