{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T10:35:37Z","timestamp":1763202937346,"version":"3.40.3"},"publisher-location":"Cham","reference-count":70,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031733895"},{"type":"electronic","value":"9783031733901"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73390-1_19","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T16:24:01Z","timestamp":1730305441000},"page":"319-337","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Diff-Tracker: Text-to-Image Diffusion Models are Unsupervised Trackers"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-8956-0095","authenticated-orcid":false,"given":"Zhengbo","family":"Zhang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1575-5724","authenticated-orcid":false,"given":"Li","family":"Xu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3281-0772","authenticated-orcid":false,"given":"Duo","family":"Peng","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1920-0371","authenticated-orcid":false,"given":"Hossein","family":"Rahmani","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4365-4165","authenticated-orcid":false,"given":"Jun","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"19_CR1","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"850","DOI":"10.1007\/978-3-319-48881-3_56","volume-title":"Computer Vision \u2013 ECCV 2016 Workshops","author":"L Bertinetto","year":"2016","unstructured":"Bertinetto, L., Valmadre, J., Henriques, J.F., Vedaldi, A., Torr, P.H.S.: Fully-convolutional Siamese networks for object tracking. In: Hua, G., J\u00e9gou, H. (eds.) ECCV 2016. LNCS, vol. 9914, pp. 850\u2013865. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-48881-3_56"},{"key":"19_CR2","doi-asserted-by":"crossref","unstructured":"Bhat, G., Danelljan, M., Gool, L.V., Timofte, R.: Learning discriminative model prediction for tracking. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6182\u20136191 (2019)","DOI":"10.1109\/ICCV.2019.00628"},{"issue":"3","key":"19_CR3","first-page":"289","volume":"14","author":"W Budiharto","year":"2020","unstructured":"Budiharto, W., Irwansyah, E., Suroso, J.S., Gunawan, A.A.S.: Design of object tracking for military robot using PID controller and computer vision. ICIC Express Lett. 14(3), 289\u2013294 (2020)","journal-title":"ICIC Express Lett."},{"issue":"10\u201311","key":"19_CR4","doi-asserted-by":"publisher","first-page":"2764","DOI":"10.1177\/09544070211006520","volume":"235","author":"J Chen","year":"2021","unstructured":"Chen, J., Ai, Y., Qian, Y., Zhang, W.: A novel Siamese attention network for visual object tracking of autonomous vehicles. Proc. Inst. Mech. Eng. Part D J. Autom. Eng. 235(10\u201311), 2764\u20132775 (2021)","journal-title":"Proc. Inst. Mech. Eng. Part D J. Autom. Eng."},{"key":"19_CR5","doi-asserted-by":"crossref","unstructured":"Chen, J., et al.: VideoLLM-online: online video large language model for streaming video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18407\u201318418 (2024)","DOI":"10.1109\/CVPR52733.2024.01742"},{"key":"19_CR6","doi-asserted-by":"crossref","unstructured":"Cheng, X., Xiong, H., Fan, D.P., Zhong, Y., Harandi, M., Drummond, T., Ge, Z.: Implicit motion handling for video camouflaged object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13864\u201313873 (2022)","DOI":"10.1109\/CVPR52688.2022.01349"},{"key":"19_CR7","doi-asserted-by":"crossref","unstructured":"Cui, Y., Jiang, C., Wang, L., Wu, G.: Mixformer: end-to-end tracking with iterative mixed attention. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13608\u201313618 (2022)","DOI":"10.1109\/CVPR52688.2022.01324"},{"key":"19_CR8","doi-asserted-by":"crossref","unstructured":"Danelljan, M., Bhat, G., Khan, F.S., Felsberg, M.: ATOM: accurate tracking by overlap maximization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4660\u20134669 (2019)","DOI":"10.1109\/CVPR.2019.00479"},{"key":"19_CR9","doi-asserted-by":"crossref","unstructured":"Danelljan, M., Bhat, G., Shahbaz\u00a0Khan, F., Felsberg, M.: ECO: efficient convolution operators for tracking. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6638\u20136646 (2017)","DOI":"10.1109\/CVPR.2017.733"},{"key":"19_CR10","doi-asserted-by":"crossref","unstructured":"Danelljan, M., Gool, L.V., Timofte, R.: Probabilistic regression for visual tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7183\u20137192 (2020)","DOI":"10.1109\/CVPR42600.2020.00721"},{"key":"19_CR11","doi-asserted-by":"crossref","unstructured":"Danelljan, M., H\u00e4ger, G., Khan, F., Felsberg, M.: Accurate scale estimation for robust visual tracking. In: British Machine Vision Conference, Nottingham, 1\u20135 September 2014. BMVA Press (2014)","DOI":"10.5244\/C.28.65"},{"key":"19_CR12","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"19_CR13","doi-asserted-by":"crossref","unstructured":"Fan, H., et al.: LaSOT: a high-quality benchmark for large-scale single object tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5374\u20135383 (2019)","DOI":"10.1109\/CVPR.2019.00552"},{"key":"19_CR14","doi-asserted-by":"crossref","unstructured":"Fang, J., Li, Z., Xue, J.: Spatial-sequential-spectral context awareness tracking. In: 2017 IEEE International Conference on Image Processing (ICIP), pp. 2582\u20132586. IEEE (2017)","DOI":"10.1109\/ICIP.2017.8296749"},{"key":"19_CR15","doi-asserted-by":"crossref","unstructured":"Foo, L.G., Gong, J., Rahmani, H., Liu, J.: Distribution-aligned diffusion for human mesh recovery. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 9221\u20139232 (2023)","DOI":"10.1109\/ICCV51070.2023.00846"},{"issue":"4","key":"19_CR16","doi-asserted-by":"publisher","first-page":"1612","DOI":"10.1109\/TITS.2019.2930337","volume":"21","author":"M Gao","year":"2019","unstructured":"Gao, M., Jin, L., Jiang, Y., Guo, B.: Manifold Siamese network: a novel visual tracking convnet for autonomous vehicles. IEEE Trans. Intell. Transp. Syst. 21(4), 1612\u20131623 (2019)","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"19_CR17","doi-asserted-by":"crossref","unstructured":"Gong, J., Foo, L.G., Fan, Z., Ke, Q., Rahmani, H., Liu, J.: DiffPose: toward more reliable 3D pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 13041\u201313051 (2023)","DOI":"10.1109\/CVPR52729.2023.01253"},{"key":"19_CR18","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"issue":"5","key":"19_CR19","doi-asserted-by":"publisher","first-page":"3291","DOI":"10.1109\/TCSVT.2021.3102605","volume":"32","author":"Y He","year":"2021","unstructured":"He, Y., Xu, X., Zhang, J., Shen, F., Yang, Y., Shen, H.T.: Modeling two-stream correspondence for visual sound separation. IEEE Trans. Circuits Syst. Video Technol. 32(5), 3291\u20133302 (2021)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"3","key":"19_CR20","doi-asserted-by":"publisher","first-page":"583","DOI":"10.1109\/TPAMI.2014.2345390","volume":"37","author":"JF Henriques","year":"2014","unstructured":"Henriques, J.F., Caseiro, R., Martins, P., Batista, J.: High-speed tracking with kernelized correlation filters. IEEE Trans. Pattern Anal. Mach. Intell. 37(3), 583\u2013596 (2014)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"19_CR21","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y., Cohen-Or, D.: Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626 (2022)"},{"key":"19_CR22","doi-asserted-by":"crossref","unstructured":"Hu, H., Gu, J., Zhang, Z., Dai, J., Wei, Y.: Relation networks for object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3588\u20133597 (2018)","DOI":"10.1109\/CVPR.2018.00378"},{"issue":"5","key":"19_CR23","doi-asserted-by":"publisher","first-page":"1562","DOI":"10.1109\/TPAMI.2019.2957464","volume":"43","author":"L Huang","year":"2019","unstructured":"Huang, L., Zhao, X., Huang, K.: GOT-10k: a large high-diversity benchmark for generic object tracking in the wild. IEEE Trans. Pattern Anal. Mach. Intell. 43(5), 1562\u20131577 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"19_CR24","unstructured":"Hui, X., Wu, Q., Rahmani, H., Liu, J.: Class-agnostic object counting with text-to-image diffusion model. In: European Conference on Computer Vision. Springer (2024)"},{"key":"19_CR25","doi-asserted-by":"crossref","unstructured":"Kawar, B., et al.: Imagic: text-based real image editing with diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6007\u20136017 (2023)","DOI":"10.1109\/CVPR52729.2023.00582"},{"key":"19_CR26","doi-asserted-by":"crossref","unstructured":"Khachatryan, L., et al.: Text2video-zero: text-to-image diffusion models are zero-shot video generators. arXiv preprint arXiv:2303.13439 (2023)","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"19_CR27","unstructured":"Khani, A., Taghanaki, S.A., Sanghi, A., Amiri, A.M., Hamarneh, G.: Slime: segment like me. arXiv preprint arXiv:2309.03179 (2023)"},{"key":"19_CR28","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"19_CR29","unstructured":"Kristan, M., Leonardis, A., Matas, J., Felsberg, M., Pflugfelder, R., \u010cehovin, L., et\u00a0al: The visual object tracking VOT2016 challenge results. In: Karlinsky, L., Michaeli, T., Nishino, K. (eds.) Computer Vision \u2013 ECCV 2016 Workshops, pp. 777\u2013823. Springer, Cham (2016)"},{"key":"19_CR30","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1007\/978-3-030-11009-3_1","volume-title":"Computer Vision \u2013 ECCV 2018 Workshops","author":"M Kristan","year":"2019","unstructured":"Kristan, M., et al.: The sixth visual object tracking VOT2018 challenge results. In: Leal-Taix\u00e9, L., Roth, S. (eds.) ECCV 2018. LNCS, vol. 11129, pp. 3\u201353. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-11009-3_1"},{"key":"19_CR31","doi-asserted-by":"crossref","unstructured":"Li, B., Wu, W., Wang, Q., Zhang, F., Xing, J., Yan, J.: SiamRPN++: evolution of Siamese visual tracking with very deep networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4282\u20134291 (2019)","DOI":"10.1109\/CVPR.2019.00441"},{"key":"19_CR32","doi-asserted-by":"crossref","unstructured":"Li, B., Yan, J., Wu, W., Zhu, Z., Hu, X.: High performance visual tracking with Siamese region proposal network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 8971\u20138980 (2018)","DOI":"10.1109\/CVPR.2018.00935"},{"key":"19_CR33","first-page":"16743","volume":"35","author":"L Lin","year":"2022","unstructured":"Lin, L., Fan, H., Zhang, Z., Xu, Y., Ling, H.: SwinTrack: a simple and strong baseline for transformer tracking. Adv. Neural. Inf. Process. Syst. 35, 16743\u201316754 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR34","doi-asserted-by":"crossref","unstructured":"Liu, L., et al.: Learning by analogy: reliable supervision from transformations for unsupervised optical flow estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6489\u20136498 (2020)","DOI":"10.1109\/CVPR42600.2020.00652"},{"key":"19_CR35","doi-asserted-by":"crossref","unstructured":"Liu, R., Wu, R., Van\u00a0Hoorick, B., Tokmakov, P., Zakharov, S., Vondrick, C.: Zero-1-to-3: zero-shot one image to 3D object. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9298\u20139309 (2023)","DOI":"10.1109\/ICCV51070.2023.00853"},{"key":"19_CR36","doi-asserted-by":"crossref","unstructured":"Mayer, C., et al.: Transforming model prediction for tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8731\u20138740 (2022)","DOI":"10.1109\/CVPR52688.2022.00853"},{"key":"19_CR37","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"310","DOI":"10.1007\/978-3-030-01246-5_19","volume-title":"Computer Vision \u2013 ECCV 2018","author":"M M\u00fcller","year":"2018","unstructured":"M\u00fcller, M., Bibi, A., Giancola, S., Alsubaihi, S., Ghanem, B.: TrackingNet: a large-scale dataset and benchmark for object tracking in the wild. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11205, pp. 310\u2013327. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01246-5_19"},{"issue":"1","key":"19_CR38","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1109\/70.210792","volume":"9","author":"NP Papanikolopoulos","year":"1993","unstructured":"Papanikolopoulos, N.P., Khosla, P.K., Kanade, T.: Visual tracking of a moving target by a camera mounted on a robot: a combination of control and vision. IEEE Trans. Robot. Autom. 9(1), 14\u201335 (1993)","journal-title":"IEEE Trans. Robot. Autom."},{"key":"19_CR39","doi-asserted-by":"crossref","unstructured":"Paul, M., Danelljan, M., Mayer, C., Van\u00a0Gool, L.: Robust visual tracking by segmentation. In: European Conference on Computer Vision, pp. 571\u2013588. Springer (2022)","DOI":"10.1007\/978-3-031-20047-2_33"},{"key":"19_CR40","doi-asserted-by":"crossref","unstructured":"Peng, D., Ke, Q., Lei, Y., Liu, J.: Unsupervised domain adaptation via domain-adaptive diffusion. arXiv preprint arXiv:2308.13893 (2023)","DOI":"10.1109\/TIP.2024.3424985"},{"key":"19_CR41","doi-asserted-by":"crossref","unstructured":"Peng, D., Zhang, Z., Hu, P., Ke, Q., Yau, D., Liu, J.: Harnessing text-to-image diffusion models for category-agnostic pose estimation. In: European Conference on Computer Vision. Springer (2024)","DOI":"10.1007\/978-3-031-72624-8_20"},{"key":"19_CR42","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"19_CR43","doi-asserted-by":"crossref","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-net: Convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) Medical Image Computing and Computer-Assisted Intervention\u2013MICCAI 2015: 18th International Conference, Munich, Germany, October 5-9, 2015, Proceedings, Part III 18, pp. 234\u2013241. Springer (2015)","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"19_CR44","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., et al.: ImageNet large scale visual recognition challenge. Int. J. Comput. Vision 115, 211\u2013252 (2015)","journal-title":"Int. J. Comput. Vision"},{"key":"19_CR45","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR46","doi-asserted-by":"crossref","unstructured":"Shen, Q., et al.: Unsupervised learning of accurate Siamese tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8101\u20138110 (2022)","DOI":"10.1109\/CVPR52688.2022.00793"},{"key":"19_CR47","doi-asserted-by":"crossref","unstructured":"Sio, C.H., Ma, Y.J., Shuai, H.H., Chen, J.C., Cheng, W.H.: S2SiamFC: self-supervised fully convolutional Siamese network for visual tracking. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 1948\u20131957 (2020)","DOI":"10.1145\/3394171.3413611"},{"key":"19_CR48","doi-asserted-by":"crossref","unstructured":"Tan, M., Pang, R., Le, Q.V.: EfficientDet: scalable and efficient object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10781\u201310790 (2020)","DOI":"10.1109\/CVPR42600.2020.01079"},{"key":"19_CR49","unstructured":"Tang, L., Jia, M., Wang, Q., Phoo, C.P., Hariharan, B.: Emergent correspondence from image diffusion. arXiv preprint arXiv:2306.03881 (2023)"},{"key":"19_CR50","doi-asserted-by":"crossref","unstructured":"Wang, N., Song, Y., Ma, C., Zhou, W., Liu, W., Li, H.: Unsupervised deep tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1308\u20131317 (2019)","DOI":"10.1109\/CVPR.2019.00140"},{"key":"19_CR51","doi-asserted-by":"publisher","first-page":"400","DOI":"10.1007\/s11263-020-01357-4","volume":"129","author":"N Wang","year":"2021","unstructured":"Wang, N., Zhou, W., Song, Y., Ma, C., Liu, W., Li, H.: Unsupervised deep representation learning for real-time tracking. Int. J. Comput. Vision 129, 400\u2013418 (2021)","journal-title":"Int. J. Comput. Vision"},{"key":"19_CR52","doi-asserted-by":"crossref","unstructured":"Wong, B., Chen, J., Wu, Y., Lei, S.W., Mao, D., Gao, D., Shou, M.Z.: Assistq: Affordance-centric question-driven task completion for egocentric assistant. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) European Conference on Computer Vision, pp. 485\u2013501. Springer, Cham (2022)","DOI":"10.1007\/978-3-031-20059-5_28"},{"issue":"9","key":"19_CR53","doi-asserted-by":"publisher","first-page":"1834","DOI":"10.1109\/TPAMI.2014.2388226","volume":"37","author":"Y Wu","year":"2015","unstructured":"Wu, Y., Lim, J., Yang, M.H.: Object tracking benchmark. IEEE Trans. Pattern Anal. Mach. Intell. 37(9), 1834\u20131848 (2015)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"19_CR54","doi-asserted-by":"crossref","unstructured":"Xie, S., Zhang, Z., Lin, Z., Hinz, T., Zhang, K.: Smartbrush: text and shape guided object inpainting with diffusion model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22428\u201322437 (2023)","DOI":"10.1109\/CVPR52729.2023.02148"},{"key":"19_CR55","doi-asserted-by":"crossref","unstructured":"Xie, X., Cheng, G., Wang, J., Yao, X., Han, J.: Oriented R-CNN for object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3520\u20133529 (2021)","DOI":"10.1109\/ICCV48922.2021.00350"},{"key":"19_CR56","doi-asserted-by":"crossref","unstructured":"Xu, L., Huang, H., Liu, J.: SUTD-TrafficQA: a question answering benchmark and an efficient network for video reasoning over traffic events. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9878\u20139888 (2021)","DOI":"10.1109\/CVPR46437.2021.00975"},{"key":"19_CR57","doi-asserted-by":"crossref","unstructured":"Xu, L., Huang, M.H., Shang, X., Yuan, Z., Sun, Y., Liu, J.: Meta compositional referring expression segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19478\u201319487 (2023)","DOI":"10.1109\/CVPR52729.2023.01866"},{"key":"19_CR58","doi-asserted-by":"crossref","unstructured":"Xu, N., et al: YouTube-VOS: sequence-to-sequence video object segmentation. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 585\u2013601. Springer (2018)","DOI":"10.1007\/978-3-030-01228-1_36"},{"key":"19_CR59","doi-asserted-by":"crossref","unstructured":"Yan, B., Peng, H., Fu, J., Wang, D., Lu, H.: Learning spatio-temporal transformer for visual tracking. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10448\u201310457 (2021)","DOI":"10.1109\/ICCV48922.2021.01028"},{"key":"19_CR60","doi-asserted-by":"publisher","unstructured":"Ye, B., Chang, H., Ma, B., Shan, S., Chen, X.: Joint feature learning and relation modeling for tracking: a one-stream framework. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds) European Conference on Computer Vision, pp. 341\u2013357. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20047-2_20","DOI":"10.1007\/978-3-031-20047-2_20"},{"key":"19_CR61","doi-asserted-by":"crossref","unstructured":"Yu, C., Wang, J., Peng, C., Gao, C., Yu, G., Sang, N.: BiseNet: bilateral segmentation network for real-time semantic segmentation. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 325\u2013341 (2018)","DOI":"10.1007\/978-3-030-01261-8_20"},{"key":"19_CR62","doi-asserted-by":"crossref","unstructured":"Yu, Y., Xiong, Y., Huang, W., Scott, M.R.: Deformable Siamese attention networks for visual object tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6728\u20136737 (2020)","DOI":"10.1109\/CVPR42600.2020.00676"},{"key":"19_CR63","doi-asserted-by":"publisher","first-page":"976","DOI":"10.1109\/TIP.2020.3037518","volume":"30","author":"D Yuan","year":"2020","unstructured":"Yuan, D., Chang, X., Huang, P.Y., Liu, Q., He, Z.: Self-supervised deep correlation tracking. IEEE Trans. Image Process. 30, 976\u2013985 (2020)","journal-title":"IEEE Trans. Image Process."},{"key":"19_CR64","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Zhou, C., Tu, Z.: Distilling inter-class distance for semantic segmentation. arXiv preprint arXiv:2205.03650 (2022)","DOI":"10.24963\/ijcai.2022\/235"},{"key":"19_CR65","unstructured":"Zhang, Z., Zhou, Y., Gong, J., Liu, J., Tu, Z.: Instance temperature knowledge distillation. arXiv preprint arXiv:2407.00115 (2024)"},{"key":"19_CR66","doi-asserted-by":"crossref","unstructured":"Zhao, Q., Dai, Y., Li, H., Hu, W., Zhang, F., Liu, J.: LTGC: long-tail recognition via leveraging LLMS-driven generated content. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19510\u201319520 (2024)","DOI":"10.1109\/CVPR52733.2024.01845"},{"key":"19_CR67","unstructured":"Zhao, Q., Huang, Y., Hu, W., Zhang, F., Liu, J.: MixPro: data augmentation with maskmix and progressive attention labeling for vision transformer. arXiv preprint arXiv:2304.12043 (2023)"},{"key":"19_CR68","doi-asserted-by":"crossref","unstructured":"Zheng, J., Ma, C., Peng, H., Yang, X.: Learning to track objects from unlabeled videos. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13546\u201313555 (2021)","DOI":"10.1109\/ICCV48922.2021.01329"},{"issue":"12","key":"19_CR69","doi-asserted-by":"publisher","first-page":"5140","DOI":"10.1109\/TIP.2015.2479460","volume":"24","author":"G Zhu","year":"2015","unstructured":"Zhu, G., Wang, J., Zhao, C., Lu, H.: Weighted part context learning for visual tracking. IEEE Trans. Image Process. 24(12), 5140\u20135151 (2015)","journal-title":"IEEE Trans. Image Process."},{"key":"19_CR70","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"103","DOI":"10.1007\/978-3-030-01240-3_7","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Z Zhu","year":"2018","unstructured":"Zhu, Z., Wang, Q., Li, B., Wu, W., Yan, J., Hu, W.: Distractor-aware Siamese networks for visual object tracking. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11213, pp. 103\u2013119. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01240-3_7"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73390-1_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T16:35:32Z","timestamp":1730306132000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73390-1_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031733895","9783031733901"],"references-count":70,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73390-1_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}