{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,17]],"date-time":"2025-09-17T16:51:39Z","timestamp":1758127899628,"version":"3.38.0"},"reference-count":37,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2024,6,19]],"date-time":"2024-06-19T00:00:00Z","timestamp":1718755200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,6,19]],"date-time":"2024-06-19T00:00:00Z","timestamp":1718755200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62376271","62376271","62376271","62376271","62376271","62376271"],"award-info":[{"award-number":["62376271","62376271","62376271","62376271","62376271","62376271"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Beijing Natural Science Foundation","award":["L231013","L231013","L231013","L231013","L231013","L231013"],"award-info":[{"award-number":["L231013","L231013","L231013","L231013","L231013","L231013"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2025,3]]},"DOI":"10.1007\/s00371-024-03544-7","type":"journal-article","created":{"date-parts":[[2024,6,19]],"date-time":"2024-06-19T19:02:32Z","timestamp":1718823752000},"page":"2425-2437","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["ROMOT: Referring-expression-comprehension open-set multi-object tracking"],"prefix":"10.1007","volume":"41","author":[{"given":"Wei","family":"Li","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bowen","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jingqi","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Weiliang","family":"Meng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiguang","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaopeng","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,6,19]]},"reference":[{"key":"3544_CR1","first-page":"726","volume":"34","author":"Z Wang","year":"2021","unstructured":"Wang, Z., Zhao, H., Li, Y.-L., Wang, S., Torr, P., Bertinetto, L.: Do different tracking tasks require different appearance models? Adv. Neural. Inf. Process. Syst. 34, 726\u2013738 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3544_CR2","doi-asserted-by":"publisher","first-page":"15380","DOI":"10.1109\/TPAMI.2023.3301975","volume":"45","author":"T Fischer","year":"2023","unstructured":"Fischer, T., Huang, T.E., Pang, J., Qiu, L., Chen, H., Darrell, T., Yu, F.: Qdtrack: quasi-dense similarity learning for appearance-only multiple object tracking. IEEE Trans. Pattern Anal. Mach. Intell. 45, 15380\u201315393 (2023)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3544_CR3","doi-asserted-by":"crossref","unstructured":"Cao, J., Pang, J., Weng, X., Khirodkar, R., Kitani, K.: Observation-centric sort: Rethinking sort for robust multi-object tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 9686\u20139696 (2023)","DOI":"10.1109\/CVPR52729.2023.00934"},{"key":"3544_CR4","doi-asserted-by":"crossref","unstructured":"Liu, Y., Zulfikar, I.E., Luiten, J., Dave, A., Ramanan, D., Leibe, B., O\u0161ep, A., Leal-Taix\u00e9 L.: Opening up open world tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 19045\u201319055 (2022)","DOI":"10.1109\/CVPR52688.2022.01846"},{"key":"3544_CR5","doi-asserted-by":"crossref","unstructured":"Wu, D., Han, W., Wang, T., Dong, X., Zhang, X., Shen, J.: Referring multi-object tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 14633\u201314642 (2023)","DOI":"10.1109\/CVPR52729.2023.01406"},{"key":"3544_CR6","doi-asserted-by":"crossref","unstructured":"Li, S., Fischer, T., Ke, L., Ding, H., Danelljan, M., Yu, F.: Ovtrack: open-vocabulary multiple object tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 5567\u20135577 (2023)","DOI":"10.1109\/CVPR52729.2023.00539"},{"key":"3544_CR7","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Sun, P., Jiang, Y., Yu, D., Weng, F., Yuan, Z., Luo, P., Liu, W., Wang, X.: Bytetrack: multi-object tracking by associating every detection box. In European Conference on Computer Vision, pages 1\u201321. Springer (2022)","DOI":"10.1007\/978-3-031-20047-2_1"},{"key":"3544_CR8","doi-asserted-by":"publisher","first-page":"3069","DOI":"10.1007\/s11263-021-01513-4","volume":"129","author":"Y Zhang","year":"2021","unstructured":"Zhang, Y., Wang, C., Wang, X., Zeng, W., Liu, W.: Fairmot: on the fairness of detection and re-identification in multiple object tracking. Int. J. Comput. Vis. 129, 3069\u20133087 (2021)","journal-title":"Int. J. Comput. Vis."},{"key":"3544_CR9","doi-asserted-by":"crossref","unstructured":"Ma, F., Shou, M.Z., Zhu, L., Fan, H., Xu, Y., Yang, Y., Yan, Z.: Unified transformer tracker for object tracking. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 8781\u20138790 2022","DOI":"10.1109\/CVPR52688.2022.00858"},{"key":"3544_CR10","doi-asserted-by":"crossref","unstructured":"Geiger, A., Lenz, P., Urtasun, R.: Are we ready for autonomous driving? The kitti vision benchmark suite. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition, pages 3354\u20133361. IEEE (2012)","DOI":"10.1109\/CVPR.2012.6248074"},{"key":"3544_CR11","doi-asserted-by":"publisher","first-page":"845","DOI":"10.1007\/s11263-020-01393-0","volume":"129","author":"P Dendorfer","year":"2021","unstructured":"Dendorfer, P., Osep, A., Milan, A., Schindler, K., Cremers, D., Reid, I., Roth, S., Leal-Taix\u00e9, L.: Motchallenge: a benchmark for single-camera multiple target tracking. Int. J. Comput. Vision 129, 845\u2013881 (2021)","journal-title":"Int. J. Comput. Vision"},{"key":"3544_CR12","doi-asserted-by":"crossref","unstructured":"Dave, A., Khurana, T., Tokmakov, P., Schmid, C., Ramanan, D.: Tao: A large-scale benchmark for tracking any object. In Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part V 16, pages 436\u2013454. Springer (2020)","DOI":"10.1007\/978-3-030-58558-7_26"},{"key":"3544_CR13","doi-asserted-by":"crossref","unstructured":"Dave, A., Tokmakov, P., Ramanan, D.: Towards segmenting anything that moves. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision Workshops, pages 0 (2019)","DOI":"10.1109\/ICCVW.2019.00187"},{"key":"3544_CR14","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J.: et\u00a0al. Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pages 8748\u20138763. PMLR (2021)"},{"key":"3544_CR15","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Mintun, E., Ravi, N., Mao, H., Rolland, C., Gustafson, L., Xiao, T., Whitehead, S., Berg, A.C., Lo, W.-Y.: et\u00a0al. Segment anything. arXiv preprint arXiv:2304.02643 (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"3544_CR16","doi-asserted-by":"crossref","unstructured":"Li, L.H., Zhang, P., Zhang, H., Yang, J., Li, C., Zhong, Y., Wang, L., Yuan, L., Zhang, L., Hwang, J.-N.: et\u00a0al. Grounded language-image pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 10965\u201310975 (2022)","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"3544_CR17","first-page":"1","volume":"28","author":"S Ren","year":"2015","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. Adv. Neural Inf. Process. Syst. 28, 1\u20139 (2015)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"3544_CR18","unstructured":"Liu, S., Li, F., Zhang, H., Yang, X., Qi, X., Su, H., Zhu, J., Zhang, L.: Dab-detr: Dynamic anchor boxes are better queries for detr. arXiv preprint arXiv:2201.12329 (2022)"},{"key":"3544_CR19","doi-asserted-by":"crossref","unstructured":"Li, F., Zhang, H., Liu, S., Guo, J., Ni, L.M., Zhang, L.: Dn-detr: Accelerate detr training by introducing query denoising. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 13619\u201313627 (2022)","DOI":"10.1109\/CVPR52688.2022.01325"},{"key":"3544_CR20","unstructured":"Zhang, H., Li, F., Liu, S., Zhang, L., Su, H., Zhu, J., Ni, L.M., Shum, H.-Y.: Dino: Detr with improved denoising anchor boxes for end-to-end object detection. arXiv preprint arXiv:2203.03605 (2022)"},{"key":"3544_CR21","unstructured":"Miao, P., Su, W., Wang, L., Fu, Y., Li, X.: Referring expression comprehension via cross-level multi-modal fusion. arXiv preprint arXiv:2204.09957 (2022)"},{"key":"3544_CR22","doi-asserted-by":"crossref","unstructured":"Chen, Z., Wang, P., Ma, L., Wong K.Y.K., Wu, Q.: Cops-ref: a new dataset and task on compositional referring expression comprehension. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 10086\u201310095 (2020)","DOI":"10.1109\/CVPR42600.2020.01010"},{"key":"3544_CR23","doi-asserted-by":"crossref","unstructured":"Subramanian, S., Merrill, W., Darrell, T., Gardner, M., Singh, S., Rohrbach, A.: Reclip: a strong zero-shot baseline for referring expression comprehension. arXiv preprint arXiv:2204.05991 (2022)","DOI":"10.18653\/v1\/2022.acl-long.357"},{"key":"3544_CR24","doi-asserted-by":"crossref","unstructured":"Liu, S., Zeng, Z., Ren, T., Li, F., Zhang, H., Yang, J., Li, C., Yang, J., Su, H., Zhu, J.: et\u00a0al. Grounding dino: marrying dino with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499, (2023)","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"3544_CR25","doi-asserted-by":"crossref","unstructured":"Zhang, H., Li, F., Zou, X., Liu, S., Li, C., Yang, J., Zhang, L.: A simple framework for open-vocabulary segmentation and detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pages 1020\u20131031 (2023)","DOI":"10.1109\/ICCV51070.2023.00100"},{"key":"3544_CR26","doi-asserted-by":"crossref","unstructured":"Li, W., Meng, W., Li, B., Zhang, J., Zhang, X.: Scoot: self-supervised centric open-set object tracking. In: SIGGRAPH Asia 2023 Posters, pages 1\u20132. (2023)","DOI":"10.1145\/3610542.3626130"},{"key":"3544_CR27","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.B.: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805, (2018)"},{"key":"3544_CR28","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., Guo, B.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pages 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"3544_CR29","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: Sergey: end-to-end object detection with transformers. In: European Conference on Computer Vision, pages 213\u2013229. Springer (2020)","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"3544_CR30","doi-asserted-by":"crossref","unstructured":"Wang, Z., Zheng, L., Liu, Y., Li, Y., Wang, S.: Towards real-time multi-object tracking. In: European Conference on Computer Vision, pages 107\u2013122. Springer (2020)","DOI":"10.1007\/978-3-030-58621-8_7"},{"key":"3544_CR31","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., Girshick, R.: Lvis: A dataset for large vocabulary instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 5356\u20135364 (2019)","DOI":"10.1109\/CVPR.2019.00550"},{"key":"3544_CR32","doi-asserted-by":"publisher","first-page":"548","DOI":"10.1007\/s11263-020-01375-2","volume":"129","author":"J Luiten","year":"2021","unstructured":"Luiten, J., Osep, A., Dendorfer, P., Torr, P., Geiger, A., Leal-Taix\u00e9, L., Leibe, B.: Hota: a higher order metric for evaluating multi-object tracking. Int. J. Comput. Vision 129, 548\u2013578 (2021)","journal-title":"Int. J. Comput. Vision"},{"key":"3544_CR33","doi-asserted-by":"crossref","unstructured":"Li, S., Danelljan, M., Ding, H., Huang, T.E., Yu, F.: Tracking every thing in the wild. In: European Conference on Computer Vision, pages 498\u2013515. Springer (2022)","DOI":"10.1007\/978-3-031-20047-2_29"},{"key":"3544_CR34","doi-asserted-by":"crossref","unstructured":"Wojke, N., Bewley, A., Paulus, D.: Simple online and realtime tracking with a deep association metric. In: 2017 IEEE International Conference on Image Processing (ICIP), pages 3645\u20133649. IEEE (2017)","DOI":"10.1109\/ICIP.2017.8296962"},{"key":"3544_CR35","doi-asserted-by":"crossref","unstructured":"Bergmann, P., Meinhardt, T., Leal-Taixe, L.: Tracking without bells and whistles. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pages 941\u2013951 (2019)","DOI":"10.1109\/ICCV.2019.00103"},{"key":"3544_CR36","doi-asserted-by":"crossref","unstructured":"Meinhardt, T., Kirillov, A., Leal-Taixe, L., Feichtenhofer, C.: Trackformer: Multi-object tracking with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 8844\u20138854 (2022)","DOI":"10.1109\/CVPR52688.2022.00864"},{"key":"3544_CR37","doi-asserted-by":"crossref","unstructured":"Zhong, Y., Yang, J., Zhang, P., Li, C., Codella, N., Li, L.H., Zhou, L., Dai, X., Yuan, L., Li, Y., Gao, J.: Regionclip: Region-based language-image pretraining. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 16793\u201316803 (2022)","DOI":"10.1109\/CVPR52688.2022.01629"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-024-03544-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-024-03544-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-024-03544-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,3]],"date-time":"2025-03-03T11:32:34Z","timestamp":1741001554000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-024-03544-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,19]]},"references-count":37,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2025,3]]}},"alternative-id":["3544"],"URL":"https:\/\/doi.org\/10.1007\/s00371-024-03544-7","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"type":"print","value":"0178-2789"},{"type":"electronic","value":"1432-2315"}],"subject":[],"published":{"date-parts":[[2024,6,19]]},"assertion":[{"value":"9 June 2024","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 June 2024","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}