{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T14:17:09Z","timestamp":1762957029299,"version":"3.40.3"},"publisher-location":"Cham","reference-count":107,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031198052"},{"type":"electronic","value":"9783031198069"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19806-9_14","type":"book-chapter","created":{"date-parts":[[2022,10,19]],"date-time":"2022-10-19T23:11:54Z","timestamp":1666221114000},"page":"238-258","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Bridging Images and\u00a0Videos: A Simple Learning Framework for\u00a0Large Vocabulary Video Object Detection"],"prefix":"10.1007","author":[{"given":"Sanghyun","family":"Woo","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kwanyong","family":"Park","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Seoung Wug","family":"Oh","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"In So","family":"Kweon","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Joon-Young","family":"Lee","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,10,20]]},"reference":[{"key":"14_CR1","unstructured":"Aljundi, R., Lin, M., Goujaud, B., Bengio, Y.: Gradient based sample selection for online continual learning. arXiv:1903.08671 (2019)"},{"issue":"3","key":"14_CR2","doi-asserted-by":"publisher","first-page":"24","DOI":"10.1145\/1531326.1531330","volume":"28","author":"C Barnes","year":"2009","unstructured":"Barnes, C., Shechtman, E., Finkelstein, A., Goldman, D.B.: PatchMatch: a randomized correspondence algorithm for structural image editing. ACM Trans. Graph. 28(3), 24 (2009)","journal-title":"ACM Trans. Graph."},{"key":"14_CR3","doi-asserted-by":"crossref","unstructured":"Bergmann, P., Meinhardt, T., Leal-Taixe, L.: Tracking without bells and whistles. In: ICCV, pp. 941\u2013951 (2019)","DOI":"10.1109\/ICCV.2019.00103"},{"key":"14_CR4","doi-asserted-by":"crossref","unstructured":"Bewley, A., Ge, Z., Ott, L., Ramos, F., Upcroft, B.: Simple online and realtime tracking. In: ICIP, pp. 3464\u20133468 (2016)","DOI":"10.1109\/ICIP.2016.7533003"},{"key":"14_CR5","unstructured":"Bochkovskiy, A., Wang, C.Y., Liao, H.Y.M.: Yolov4: optimal speed and accuracy of object detection. arXiv:2004.10934 (2020)"},{"key":"14_CR6","doi-asserted-by":"crossref","unstructured":"Cai, Z., Vasconcelos, N.: Cascade R-CNN: delving into high quality object detection. In: CVPR, pp. 6154\u20136162 (2018)","DOI":"10.1109\/CVPR.2018.00644"},{"key":"14_CR7","doi-asserted-by":"crossref","unstructured":"Cai, Z., Vasconcelos, N.: Cascade R-CNN: high quality object detection and instance segmentation. PAMI 43, 1483\u20131498 (2019)","DOI":"10.1109\/TPAMI.2019.2956516"},{"key":"14_CR8","unstructured":"Chang, N., Yu, Z., Wang, Y.X., Anandkumar, A., Fidler, S., Alvarez, J.M.: Image-level or object-level? A tale of two resampling strategies for long-tailed detection. arXiv:2104.05702 (2021)"},{"key":"14_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"556","DOI":"10.1007\/978-3-030-01252-6_33","volume-title":"Computer Vision \u2013 ECCV 2018","author":"A Chaudhry","year":"2018","unstructured":"Chaudhry, A., Dokania, P.K., Ajanthan, T., Torr, P.H.S.: Riemannian walk for incremental learning: understanding forgetting and intransigence. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11215, pp. 556\u2013572. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01252-6_33"},{"key":"14_CR10","doi-asserted-by":"crossref","unstructured":"Chen, K., et al.: Hybrid task cascade for instance segmentation. In: CVPR, pp. 4974\u20134983 (2019)","DOI":"10.1109\/CVPR.2019.00511"},{"key":"14_CR11","unstructured":"Dave, A., Doll\u00e1r, P., Ramanan, D., Kirillov, A., Girshick, R.: Evaluating large-vocabulary object detectors: The devil is in the details. arXiv:2102.01066 (2021)"},{"key":"14_CR12","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"436","DOI":"10.1007\/978-3-030-58558-7_26","volume-title":"Computer Vision \u2013 ECCV 2020","author":"A Dave","year":"2020","unstructured":"Dave, A., Khurana, T., Tokmakov, P., Schmid, C., Ramanan, D.: TAO: a large-scale benchmark for tracking any object. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12350, pp. 436\u2013454. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58558-7_26"},{"key":"14_CR13","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Pinz, A., Zisserman, A.: Detect to track and track to detect. In: ICCV, pp. 3038\u20133046 (2017)","DOI":"10.1109\/ICCV.2017.330"},{"key":"14_CR14","doi-asserted-by":"crossref","unstructured":"Fu, Y., Liu, S., Iqbal, U., De Mello, S., Shi, H., Kautz, J.: Learning to track instances without video annotations. In: CVPR, pp. 8680\u20138689 (2021)","DOI":"10.1109\/CVPR46437.2021.00857"},{"key":"14_CR15","doi-asserted-by":"crossref","unstructured":"Ghiasi, G., et al.: Simple copy-paste is a strong data augmentation method for instance segmentation. In: CVPR, pp. 2918\u20132928 (2021)","DOI":"10.1109\/CVPR46437.2021.00294"},{"key":"14_CR16","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., Girshick, R.: LVIS: a dataset for large vocabulary instance segmentation. In: CVPR, pp. 5356\u20135364 (2019)","DOI":"10.1109\/CVPR.2019.00550"},{"issue":"9","key":"14_CR17","doi-asserted-by":"publisher","first-page":"1263","DOI":"10.1109\/TKDE.2008.239","volume":"21","author":"H He","year":"2009","unstructured":"He, H., Garcia, E.A.: Learning from imbalanced data. IEEE Trans. Knowl. Data Eng. 21(9), 1263\u20131284 (2009)","journal-title":"IEEE Trans. Knowl. Data Eng."},{"key":"14_CR18","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask R-CNN. In: ICCV, pp. 2961\u20132969 (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"14_CR19","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"14_CR20","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"749","DOI":"10.1007\/978-3-319-46448-0_45","volume-title":"Computer Vision \u2013 ECCV 2016","author":"D Held","year":"2016","unstructured":"Held, D., Thrun, S., Savarese, S.: Learning to track at 100 FPS with deep regression networks. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 749\u2013765. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_45"},{"key":"14_CR21","unstructured":"Hinton, G., Vinyals, O., Dean, J.: Distilling the knowledge in a neural network. arXiv:1503.02531 (2015)"},{"key":"14_CR22","doi-asserted-by":"crossref","unstructured":"Hsieh, T.I., Robb, E., Chen, H.T., Huang, J.B.: Droploss for long-tail instance segmentation. arXiv:2104.06402 (2021)","DOI":"10.1609\/aaai.v35i2.16246"},{"key":"14_CR23","doi-asserted-by":"crossref","unstructured":"Hu, X., Jiang, Y., Tang, K., Chen, J., Miao, C., Zhang, H.: Learning to segment the tail. In: CVPR, pp. 14045\u201314054 (2020)","DOI":"10.1109\/CVPR42600.2020.01406"},{"key":"14_CR24","unstructured":"Kang, B., et al.: Decoupling representation and classifier for long-tailed recognition. arXiv:1910.09217 (2019)"},{"key":"14_CR25","doi-asserted-by":"crossref","unstructured":"Kim, C., Li, F., Ciptadi, A., Rehg, J.M.: Multiple hypothesis tracking revisited. In: ICCV, pp. 4696\u20134704 (2015)","DOI":"10.1109\/ICCV.2015.533"},{"key":"14_CR26","doi-asserted-by":"crossref","unstructured":"Kim, D., Woo, S., Lee, J.Y., Kweon, I.S.: Deep video inpainting. In: CVPR, pp. 5792\u20135801 (2019)","DOI":"10.1109\/CVPR.2019.00594"},{"key":"14_CR27","doi-asserted-by":"crossref","unstructured":"Kim, D., Woo, S., Lee, J.Y., Kweon, I.S.: Video panoptic segmentation. In: CVPR, pp. 9859\u20139868 (2020)","DOI":"10.1109\/CVPR42600.2020.00988"},{"key":"14_CR28","doi-asserted-by":"crossref","unstructured":"Kim, T., Oh, J., Kim, N., Cho, S., Yun, S.Y.: Comparing Kullback-Leibler divergence and mean squared error loss in knowledge distillation. arXiv:2105.08919 (2021)","DOI":"10.24963\/ijcai.2021\/362"},{"issue":"13","key":"14_CR29","doi-asserted-by":"publisher","first-page":"3521","DOI":"10.1073\/pnas.1611835114","volume":"114","author":"J Kirkpatrick","year":"2017","unstructured":"Kirkpatrick, J., et al.: Overcoming catastrophic forgetting in neural networks. Proc. Natl. Acad. Sci. 114(13), 3521\u20133526 (2017)","journal-title":"Proc. Natl. Acad. Sci."},{"key":"14_CR30","doi-asserted-by":"crossref","unstructured":"Kuznetsova, A., Ju Hwang, S., Rosenhahn, B., Sigal, L.: Expanding object detector\u2019s horizon: Incremental learning framework for object detection in videos. In: CVPR, pp. 28\u201336 (2015)","DOI":"10.1109\/CVPR.2015.7298597"},{"key":"14_CR31","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"179","DOI":"10.1007\/978-3-030-01267-0_11","volume-title":"Computer Vision \u2013 ECCV 2018","author":"W-S Lai","year":"2018","unstructured":"Lai, W.-S., Huang, J.-B., Wang, O., Shechtman, E., Yumer, E., Yang, M.-H.: Learning blind video temporal consistency. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11219, pp. 179\u2013195. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01267-0_11"},{"key":"14_CR32","doi-asserted-by":"crossref","unstructured":"Lai, Z., Lu, E., Xie, W.: MAST: a memory-augmented self-supervised tracker. In: CVPR, pp. 6479\u20136488 (2020)","DOI":"10.1109\/CVPR42600.2020.00651"},{"key":"14_CR33","unstructured":"Lai, Z., Xie, W.: Self-supervised learning for video correspondence flow. arXiv:1905.00875 (2019)"},{"key":"14_CR34","doi-asserted-by":"crossref","unstructured":"Leal-Taix\u00e9, L., Canton-Ferrer, C., Schindler, K.: Learning by tracking: siamese CNN for robust target association. In: CVPR Workshops, pp. 33\u201340 (2016)","DOI":"10.1109\/CVPRW.2016.59"},{"key":"14_CR35","unstructured":"Leal-Taix\u00e9, L., Milan, A., Schindler, K., Cremers, D., Reid, I., Roth, S.: Tracking the trackers: an analysis of the state of the art in multiple object tracking. arXiv:1704.02781 (2017)"},{"key":"14_CR36","unstructured":"Lee, S.W., Kim, J.H., Jun, J., Ha, J.W., Zhang, B.T.: Overcoming catastrophic forgetting by incremental moment matching. arXiv:1703.08475 (2017)"},{"key":"14_CR37","unstructured":"Lei, C., Xing, Y., Chen, Q.: Blind video temporal consistency via deep video prior. In: Advances in Neural Information Processing Systems 33 (2020)"},{"key":"14_CR38","unstructured":"Li, X., Liu, S., De Mello, S., Wang, X., Kautz, J., Yang, M.H.: Joint-task self-supervised learning for temporal correspondence. arXiv:1909.11895 (2019)"},{"key":"14_CR39","doi-asserted-by":"crossref","unstructured":"Li, Y., Wang, T., Kang, B., Tang, S., Wang, C., Li, J., Feng, J.: Overcoming classifier imbalance for long-tail object detection with balanced group softmax. In: CVPR, pp. 10991\u201311000 (2020)","DOI":"10.1109\/CVPR42600.2020.01100"},{"key":"14_CR40","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature pyramid networks for object detection. In: CVPR, pp. 2117\u20132125 (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"14_CR41","doi-asserted-by":"crossref","unstructured":"Liu, Y., Zulfikar, I.E., et al.: Opening up open-world tracking. arXiv:2104.11221 (2021)","DOI":"10.1109\/CVPR52688.2022.01846"},{"key":"14_CR42","doi-asserted-by":"crossref","unstructured":"Liu, Z., Miao, Z., Zhan, X., Wang, J., Gong, B., Yu, S.X.: Large-scale long-tailed recognition in an open world. In: CVPR, pp. 2537\u20132546 (2019)","DOI":"10.1109\/CVPR.2019.00264"},{"key":"14_CR43","doi-asserted-by":"crossref","unstructured":"Lu, Z., Rathod, V., Votel, R., Huang, J.: RetinaTrack: online single stage joint detection and tracking. In: CVPR, pp. 14668\u201314678 (2020)","DOI":"10.1109\/CVPR42600.2020.01468"},{"key":"14_CR44","unstructured":"Manning, C., Schutze, H.: Foundations of Statistical Natural Language Processing. MIT Press, Cambridge (1999)"},{"key":"14_CR45","doi-asserted-by":"crossref","unstructured":"McCloskey, M., Cohen, N.J.: Catastrophic interference in connectionist networks: the sequential learning problem. Psychol. Learn. Motiv. 24, 109\u2013165 (1989)","DOI":"10.1016\/S0079-7421(08)60536-8"},{"key":"14_CR46","doi-asserted-by":"crossref","unstructured":"Meinhardt, T., Kirillov, A., Leal-Taixe, L., Feichtenhofer, C.: TrackFormer: multi-object tracking with transformers. arXiv:2101.02702 (2021)","DOI":"10.1109\/CVPR52688.2022.00864"},{"key":"14_CR47","doi-asserted-by":"crossref","unstructured":"Milan, A., Rezatofighi, S.H., Dick, A., Reid, I., Schindler, K.: Online multi-target tracking using recurrent neural networks. In: Thirty-First AAAI Conference on Artificial Intelligence (2017)","DOI":"10.1609\/aaai.v31i1.11194"},{"key":"14_CR48","doi-asserted-by":"crossref","unstructured":"Oh, S.W., Lee, J.Y., Sunkavalli, K., Kim, S.J.: Fast video object segmentation by reference-guided mask propagation. In: CVPR, pp. 7376\u20137385 (2018)","DOI":"10.1109\/CVPR.2018.00770"},{"key":"14_CR49","doi-asserted-by":"crossref","unstructured":"Oh, S.W., Lee, J.Y., Xu, N., Kim, S.J.: Video object segmentation using space-time memory networks. In: ICCV, pp. 9226\u20139235 (2019)","DOI":"10.1109\/ICCV.2019.00932"},{"key":"14_CR50","unstructured":"Pan, T.Y., et al.: On model calibration for long-tailed object detection and instance segmentation. arXiv:2107.02170 (2021)"},{"key":"14_CR51","doi-asserted-by":"crossref","unstructured":"Pang, J., et al.: Quasi-dense similarity learning for multiple object tracking. In: CVPR, pp. 164\u2013173 (2021)","DOI":"10.1109\/CVPR46437.2021.00023"},{"key":"14_CR52","doi-asserted-by":"crossref","unstructured":"Park, K., Woo, S., Oh, S.W., Kweon, I.S., Lee, J.Y.: Per-clip video object segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1352\u20131361 (2022)","DOI":"10.1109\/CVPR52688.2022.00141"},{"key":"14_CR53","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"145","DOI":"10.1007\/978-3-030-58548-8_9","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Peng","year":"2020","unstructured":"Peng, J., et al.: Chained-tracker: chaining paired attentive regression results for end-to-end joint multiple-object detection and tracking. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12349, pp. 145\u2013161. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58548-8_9"},{"key":"14_CR54","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"524","DOI":"10.1007\/978-3-030-58536-5_31","volume-title":"Computer Vision \u2013 ECCV 2020","author":"A Prabhu","year":"2020","unstructured":"Prabhu, A., Torr, P.H.S., Dokania, P.K.: GDumb: a simple approach that questions our progress in continual learning. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12347, pp. 524\u2013540. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58536-5_31"},{"key":"14_CR55","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"262","DOI":"10.1007\/978-3-030-58574-7_16","volume-title":"Computer Vision \u2013 ECCV 2020","author":"S Purushwalkam","year":"2020","unstructured":"Purushwalkam, S., Ye, T., Gupta, S., Gupta, A.: Aligning videos in space and time. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12371, pp. 262\u2013278. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58574-7_16"},{"key":"14_CR56","doi-asserted-by":"crossref","unstructured":"Ramanan, D., Forsyth, D.A.: Finding and tracking people from the bottom up. In: CVPR, vol. 2, pp. II\u2013II. IEEE (2003)","DOI":"10.1109\/CVPR.2003.1211504"},{"key":"14_CR57","doi-asserted-by":"crossref","unstructured":"Rebuffi, S.A., Kolesnikov, A., Sperl, G., Lampert, C.H.: iCaRL: incremental classifier and representation learning. In: CVPR, pp. 2001\u20132010 (2017)","DOI":"10.1109\/CVPR.2017.587"},{"key":"14_CR58","unstructured":"Ren, J., et al.: Balanced meta-softmax for long-tailed visual recognition. arXiv:2007.10740 (2020)"},{"key":"14_CR59","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: NIPS, vol. 28, pp. 91\u201399 (2015)"},{"key":"14_CR60","unstructured":"Riloff, E.: Automatically generating extraction patterns from untagged text. In: Proceedings of the National Conference on Artificial Intelligence, pp. 1044\u20131049 (1996)"},{"key":"14_CR61","doi-asserted-by":"crossref","unstructured":"Riloff, E., Wiebe, J.: Learning extraction patterns for subjective expressions. In: Proceedings of the 2003 Conference on Empirical Methods in Natural Language Processing, pp. 105\u2013112 (2003)","DOI":"10.3115\/1119355.1119369"},{"key":"14_CR62","doi-asserted-by":"crossref","unstructured":"Sadeghian, A., Alahi, A., Savarese, S.: Tracking the untrackable: learning to track multiple cues with long-term dependencies. In: ICCV, pp. 300\u2013311 (2017)","DOI":"10.1109\/ICCV.2017.41"},{"issue":"3","key":"14_CR63","doi-asserted-by":"publisher","first-page":"363","DOI":"10.1109\/TIT.1965.1053799","volume":"11","author":"H Scudder","year":"1965","unstructured":"Scudder, H.: Probability of error of some adaptive pattern-recognition machines. IEEE Trans. Inf. Theory 11(3), 363\u2013371 (1965)","journal-title":"IEEE Trans. Inf. Theory"},{"key":"14_CR64","unstructured":"Shin, H., Lee, J.K., Kim, J., Kim, J.: Continual learning with deep generative replay. arXiv:1705.08690 (2017)"},{"key":"14_CR65","doi-asserted-by":"crossref","unstructured":"Shmelkov, K., Schmid, C., Alahari, K.: Incremental learning of object detectors without catastrophic forgetting. In: ICCV, pp. 3400\u20133409 (2017)","DOI":"10.1109\/ICCV.2017.368"},{"key":"14_CR66","doi-asserted-by":"crossref","unstructured":"Sio, C.H., Ma, Y.J., Shuai, H.H., Chen, J.C., Cheng, W.H.: S2SiamFC: self-supervised fully convolutional siamese network for visual tracking. In: Proceedings of ACM International Conference on Multimedia, pp. 1948\u20131957 (2020)","DOI":"10.1145\/3394171.3413611"},{"key":"14_CR67","unstructured":"Sohn, K., Zhang, Z., Li, C.L., Zhang, H., Lee, C.Y., Pfister, T.: A simple semi-supervised learning framework for object detection. arXiv preprint arXiv:2005.04757 (2020)"},{"key":"14_CR68","doi-asserted-by":"crossref","unstructured":"Son, J., Baek, M., Cho, M., Han, B.: Multi-object tracking with quadruplet convolutional neural networks. In: CVPR, pp. 5620\u20135629 (2017)","DOI":"10.1109\/CVPR.2017.403"},{"key":"14_CR69","unstructured":"Sun, P., et al.: Transtrack: multiple-object tracking with transformer. arXiv:2012.15460 (2020)"},{"key":"14_CR70","doi-asserted-by":"crossref","unstructured":"Tan, J., Lu, X., Zhang, G., Yin, C., Li, Q.: Equalization loss V2: a new gradient balance approach for long-tailed object detection. In: CVPR, pp. 1685\u20131694 (2021)","DOI":"10.1109\/CVPR46437.2021.00173"},{"key":"14_CR71","doi-asserted-by":"crossref","unstructured":"Tan, J., et al.: Equalization loss for long-tailed object recognition. In: CVPR, pp. 11662\u201311671 (2020)","DOI":"10.1109\/CVPR42600.2020.01168"},{"key":"14_CR72","doi-asserted-by":"crossref","unstructured":"Tang, Y., Chen, W., Luo, Y., Zhang, Y.: Humble teachers teach better students for semi-supervised object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3132\u20133141 (2021)","DOI":"10.1109\/CVPR46437.2021.00315"},{"key":"14_CR73","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"402","DOI":"10.1007\/978-3-030-01261-8_24","volume-title":"Computer Vision \u2013 ECCV 2018","author":"C Vondrick","year":"2018","unstructured":"Vondrick, C., Shrivastava, A., Fathi, A., Guadarrama, S., Murphy, K.: Tracking emerges by colorizing videos. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11217, pp. 402\u2013419. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01261-8_24"},{"key":"14_CR74","unstructured":"Vu, T., Jang, H., Pham, T.X., Yoo, C.D.: Cascade RPN: delving into high-quality region proposal network with adaptive convolution. arXiv:1909.06720 (2019)"},{"key":"14_CR75","doi-asserted-by":"crossref","unstructured":"Wang, J., Wang, X., Shang-Guan, Y., Gupta, A.: Wanderlust: online continual object detection in the real world. In: ICCV, pp. 10829\u201310838 (2021)","DOI":"10.1109\/ICCV48922.2021.01065"},{"key":"14_CR76","doi-asserted-by":"crossref","unstructured":"Wang, J., et al.: Seesaw loss for long-tailed instance segmentation. In: CVPR, pp. 9695\u20139704 (2021)","DOI":"10.1109\/CVPR46437.2021.00957"},{"key":"14_CR77","doi-asserted-by":"crossref","unstructured":"Wang, N., Song, Y., Ma, C., Zhou, W., Liu, W., Li, H.: Unsupervised deep tracking. In: CVPR, pp. 1308\u20131317 (2019)","DOI":"10.1109\/CVPR.2019.00140"},{"key":"14_CR78","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"728","DOI":"10.1007\/978-3-030-58568-6_43","volume-title":"Computer Vision \u2013 ECCV 2020","author":"T Wang","year":"2020","unstructured":"Wang, T., et al.: The devil is in classification: a simple framework for long-tail instance segmentation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12359, pp. 728\u2013744. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58568-6_43"},{"key":"14_CR79","doi-asserted-by":"crossref","unstructured":"Wang, T., Zhu, Y., Zhao, C., Zeng, W., Wang, J., Tang, M.: Adaptive class suppression loss for long-tail object detection. In: CVPR, pp. 3103\u20133112 (2021)","DOI":"10.1109\/CVPR46437.2021.00312"},{"key":"14_CR80","doi-asserted-by":"crossref","unstructured":"Wang, W., Feiszli, M., Wang, H., Tran, D.: Unidentified video objects: a benchmark for dense, open-world segmentation. arXiv:2104.04691 (2021)","DOI":"10.1109\/ICCV48922.2021.01060"},{"key":"14_CR81","doi-asserted-by":"crossref","unstructured":"Wang, X., Jabri, A., Efros, A.A.: Learning correspondence from the cycle-consistency of time. In: CVPR, pp. 2566\u20132576 (2019)","DOI":"10.1109\/CVPR.2019.00267"},{"key":"14_CR82","unstructured":"Wang, X., Huang, T.E., Darrell, T., Gonzalez, J.E., Yu, F.: Frustratingly simple few-shot object detection. arXiv:2003.06957 (2020)"},{"key":"14_CR83","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"107","DOI":"10.1007\/978-3-030-58621-8_7","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Z Wang","year":"2020","unstructured":"Wang, Z., Zheng, L., Liu, Y., Li, Y., Wang, S.: Towards real-time multi-object tracking. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12356, pp. 107\u2013122. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58621-8_7"},{"key":"14_CR84","doi-asserted-by":"crossref","unstructured":"Wojke, N., Bewley, A., Paulus, D.: Simple online and realtime tracking with a deep association metric. In: ICIP, pp. 3645\u20133649. IEEE (2017)","DOI":"10.1109\/ICIP.2017.8296962"},{"key":"14_CR85","doi-asserted-by":"crossref","unstructured":"Wu, J., Cao, J., Song, L., Wang, Y., Yang, M., Yuan, J.: Track to detect and segment: an online multi-object tracker. In: CVPR, pp. 12352\u201312361 (2021)","DOI":"10.1109\/CVPR46437.2021.01217"},{"key":"14_CR86","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"162","DOI":"10.1007\/978-3-030-58548-8_10","volume-title":"Computer Vision \u2013 ECCV 2020","author":"T Wu","year":"2020","unstructured":"Wu, T., Huang, Q., Liu, Z., Wang, Yu., Lin, D.: Distribution-balanced loss for multi-label classification in long-tailed datasets. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12349, pp. 162\u2013178. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58548-8_10"},{"key":"14_CR87","doi-asserted-by":"crossref","unstructured":"Wu, Y., et al.: Large scale incremental learning. In: CVPR, pp. 374\u2013382 (2019)","DOI":"10.1109\/CVPR.2019.00046"},{"key":"14_CR88","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"472","DOI":"10.1007\/978-3-030-01231-1_29","volume-title":"Computer Vision \u2013 ECCV 2018","author":"B Xiao","year":"2018","unstructured":"Xiao, B., Wu, H., Wei, Y.: Simple baselines for human pose estimation and tracking. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11210, pp. 472\u2013487. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01231-1_29"},{"key":"14_CR89","doi-asserted-by":"crossref","unstructured":"Xie, Q., Luong, M.T., Hovy, E., Le, Q.V.: Self-training with noisy student improves ImageNet classification. In: CVPR, pp. 10687\u201310698 (2020)","DOI":"10.1109\/CVPR42600.2020.01070"},{"key":"14_CR90","doi-asserted-by":"crossref","unstructured":"Xu, J., Wang, X.: Rethinking self-supervised correspondence learning: a video frame-level similarity perspective. arXiv:2103.17263 (2021)","DOI":"10.1109\/ICCV48922.2021.00992"},{"key":"14_CR91","doi-asserted-by":"crossref","unstructured":"Xu, M., et al.: End-to-end semi-supervised object detection with soft teacher. In: ICCV, pp. 3060\u20133069 (2021)","DOI":"10.1109\/ICCV48922.2021.00305"},{"key":"14_CR92","unstructured":"Xu, M., et al.: Bootstrap your object detector via mixed training 34 (2021)"},{"key":"14_CR93","doi-asserted-by":"crossref","unstructured":"Yang, L., Fan, Y., Xu, N.: Video instance segmentation. In: ICCV, pp. 5188\u20135197 (2019)","DOI":"10.1109\/ICCV.2019.00529"},{"key":"14_CR94","doi-asserted-by":"crossref","unstructured":"Yun, S., Han, D., Oh, S.J., Chun, S., Choe, J., Yoo, Y.: CutMix: regularization strategy to train strong classifiers with localizable features. In: ICCV, pp. 6023\u20136032 (2019)","DOI":"10.1109\/ICCV.2019.00612"},{"key":"14_CR95","doi-asserted-by":"crossref","unstructured":"Zang, Y., Huang, C., Loy, C.C.: FASA: feature augmentation and sampling adaptation for long-tailed instance segmentation. arXiv:2102.12867 (2021)","DOI":"10.1109\/ICCV48922.2021.00344"},{"key":"14_CR96","doi-asserted-by":"crossref","unstructured":"Zeng, F., Dong, B., Wang, T., Zhang, X., Wei, Y.: MOTR: end-to-end multiple-object tracking with transformer. arXiv:2105.03247 (2021)","DOI":"10.1007\/978-3-031-19812-0_38"},{"key":"14_CR97","doi-asserted-by":"crossref","unstructured":"Zhang, C., et al.: Mosaicos: A simple and effective use of object-centric images for long-tailed object detection. arXiv:2102.08884 (2021)","DOI":"10.1109\/ICCV48922.2021.00047"},{"key":"14_CR98","doi-asserted-by":"crossref","unstructured":"Zhang, S., Li, Z., Yan, S., He, X., Sun, J.: Distribution alignment: a unified framework for long-tail visual recognition. In: CVPR, pp. 2361\u20132370 (2021)","DOI":"10.1109\/CVPR46437.2021.00239"},{"key":"14_CR99","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: ByteTrack: multi-object tracking by associating every detection box. arXiv:2110.06864 (2021)","DOI":"10.1007\/978-3-031-20047-2_1"},{"key":"14_CR100","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Wang, C., Wang, X., Zeng, W., Liu, W.: FairMOT: on the fairness of detection and re-identification in multiple object tracking. Int. J. Comput. Vis., 1\u201319 (2021)","DOI":"10.1007\/s11263-021-01513-4"},{"key":"14_CR101","unstructured":"Zhang, Z., Cheng, D., Zhu, X., Lin, S., Dai, J.: Integrated object detection and tracking with tracklet-conditioned detection. arXiv:1811.11167 (2018)"},{"key":"14_CR102","doi-asserted-by":"crossref","unstructured":"Zheng, J., Ma, C., Peng, H., Yang, X.: Learning to track objects from unlabeled videos. In: ICCV, pp. 13546\u201313555 (2021)","DOI":"10.1109\/ICCV48922.2021.01329"},{"key":"14_CR103","unstructured":"Zhou, W., Chang, S., Sosa, N., Hamann, H., Cox, D.: Lifelong object detection. arXiv:2009.01129 (2020)"},{"key":"14_CR104","doi-asserted-by":"crossref","unstructured":"Zhou, X., Girdhar, R., Joulin, A., Kr\u00e4henb\u00fchl, P., Misra, I.: Detecting twenty-thousand classes using image-level supervision. arXiv preprint arXiv:2201.02605 (2022)","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"14_CR105","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"474","DOI":"10.1007\/978-3-030-58548-8_28","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Zhou","year":"2020","unstructured":"Zhou, X., Koltun, V., Kr\u00e4henb\u00fchl, P.: Tracking objects as points. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12349, pp. 474\u2013490. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58548-8_28"},{"key":"14_CR106","unstructured":"Zhou, X., Koltun, V., Kr\u00e4henb\u00fchl, P.: Probabilistic two-stage detection. arXiv:2103.07461 (2021)"},{"key":"14_CR107","doi-asserted-by":"crossref","unstructured":"Zhou, X., Yin, T., Koltun, V., Kr\u00e4henb\u00fchl, P.: Global tracking transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8771\u20138780 (2022)","DOI":"10.1109\/CVPR52688.2022.00857"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19806-9_14","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,6]],"date-time":"2024-10-06T04:33:50Z","timestamp":1728189230000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19806-9_14"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031198052","9783031198069"],"references-count":107,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19806-9_14","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"20 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}