{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T21:52:32Z","timestamp":1771710752960,"version":"3.50.1"},"reference-count":88,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2024,8,23]],"date-time":"2024-08-23T00:00:00Z","timestamp":1724371200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,8,23]],"date-time":"2024-08-23T00:00:00Z","timestamp":1724371200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U21A20514"],"award-info":[{"award-number":["U21A20514"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62071404"],"award-info":[{"award-number":["62071404"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62002302"],"award-info":[{"award-number":["62002302"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"FuXiaQuan National Independent Innovation Demonstration Zone Collaborative Innovation Platform Project","award":["3502ZCQXT2022008"],"award-info":[{"award-number":["3502ZCQXT2022008"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,2]]},"DOI":"10.1007\/s11263-024-02201-9","type":"journal-article","created":{"date-parts":[[2024,8,23]],"date-time":"2024-08-23T17:44:08Z","timestamp":1724435048000},"page":"890-909","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["IMC-Det: Intra\u2013Inter Modality Contrastive Learning for Video Object Detection"],"prefix":"10.1007","volume":"133","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9127-3404","authenticated-orcid":false,"given":"Qiang","family":"Qi","sequence":"first","affiliation":[]},{"given":"Zhenyu","family":"Qiu","sequence":"additional","affiliation":[]},{"given":"Yan","family":"Yan","sequence":"additional","affiliation":[]},{"given":"Yang","family":"Lu","sequence":"additional","affiliation":[]},{"given":"Hanzi","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,8,23]]},"reference":[{"key":"2201_CR1","doi-asserted-by":"crossref","unstructured":"Adarsh, P., Rathi, P., & Kumar, M. (2020). Yolo v3-tiny: Object detection and recognition using one stage improved model. In Proceedings of the International Conference on Advanced Computing and Communication Systems (pp. 687\u2013694).","DOI":"10.1109\/ICACCS48705.2020.9074315"},{"key":"2201_CR2","doi-asserted-by":"crossref","unstructured":"Cai, Z., & Vasconcelos, N. (2018). Cascade R-CNN: Delving into high quality object detection. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 6154\u20136162).","DOI":"10.1109\/CVPR.2018.00644"},{"key":"2201_CR3","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., & Zagoruyko, S. (2020). End-to-end object detection with transformers. In Proceedings of the European Conference on Computer Vision (pp. 213\u2013229).","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"2201_CR4","doi-asserted-by":"crossref","unstructured":"Chen, K., Wang, J., Yang, S., Zhang, X., Xiong, Y., Change\u00a0Loy, C., & Lin, D. (2018). Optimizing video object detection via a scale-time lattice. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 7814\u20137823).","DOI":"10.1109\/CVPR.2018.00815"},{"key":"2201_CR5","doi-asserted-by":"crossref","unstructured":"Chen, Y., Cao, Y., Hu, H., & Wang, L. (2020). Memory enhanced global-local aggregation for video object detection. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 10337\u201310346).","DOI":"10.1109\/CVPR42600.2020.01035"},{"key":"2201_CR6","doi-asserted-by":"crossref","unstructured":"Chen, Y., Qi, X., Wang, J., & Zhang, L. (2023). Disco-clip: A distributed contrastive loss for memory efficient clip training. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 22648\u201322657).","DOI":"10.1109\/CVPR52729.2023.02169"},{"key":"2201_CR7","doi-asserted-by":"crossref","unstructured":"Cui, Y., Yan, L., Cao, Z., & Liu, D. (2021). Tf-blender: Temporal feature blender for video object detection. In Proceedings of the IEEE International Conference on Computer Vision (pp. 8138\u20138147).","DOI":"10.1109\/ICCV48922.2021.00803"},{"key":"2201_CR8","doi-asserted-by":"crossref","unstructured":"Damen, D., Doughty, H., Farinella, G. M., Fidler, S., Furnari, A., Kazakos, E., Moltisanti, D., Munro, J., Perrett, T., Price, W., & et\u00a0al. (2018). Scaling egocentric vision: The epic-kitchens dataset. In Proceedings of the European Conference on Computer Vision (pp. 720\u2013736).","DOI":"10.1007\/978-3-030-01225-0_44"},{"key":"2201_CR9","doi-asserted-by":"crossref","unstructured":"Deng, H., Hua, Y., Song, T., Zhang, Z., Xue, Z., Ma, R., Robertson, N., & Guan, H. (2019a). Object guided external memory network for video object detection. In Proceedings of the IEEE International Conference on Computer Vision (pp. 6678\u20136687).","DOI":"10.1109\/ICCV.2019.00678"},{"key":"2201_CR10","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L. -J., Li, K., & Fei-Fei, L. (2009). Imagenet: A large-scale hierarchical image database. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 248\u2013255).","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2201_CR11","doi-asserted-by":"publisher","first-page":"6879","DOI":"10.1109\/TIP.2021.3099409","volume":"30","author":"J Deng","year":"2021","unstructured":"Deng, J., Pan, Y., Yao, T., Li, H., & Mei, T. (2021). Minet: Meta-learning instance identifiers for video object detection. IEEE Transactions on Image Processing, 30, 6879\u20136891.","journal-title":"IEEE Transactions on Image Processing"},{"key":"2201_CR12","doi-asserted-by":"crossref","unstructured":"Deng, J., Pan, Y., Yao, T., Zhou, W., Li, H., & Mei, T. (2019b). Relation distillation networks for video object detection. In Proceedings of the IEEE International Conference on Computer Vision (pp. 7023\u20137032).","DOI":"10.1109\/ICCV.2019.00712"},{"key":"2201_CR13","doi-asserted-by":"publisher","first-page":"846","DOI":"10.1109\/TMM.2020.2990070","volume":"23","author":"J Deng","year":"2020","unstructured":"Deng, J., Pan, Y., Yao, T., Zhou, W., Li, H., & Mei, T. (2020). Single shot video object detector. IEEE Transactions on Multimedia, 23, 846\u2013858.","journal-title":"IEEE Transactions on Multimedia"},{"key":"2201_CR14","unstructured":"Devlin, J., Chang, M. -W., Lee, K., & Toutanova, K. (2018). Bert: Pre-training of deep bidirectional transformers for language understanding. Preprint retrieved from arXiv:1810.04805"},{"key":"2201_CR15","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., & et\u00a0al. (2021). An image is worth 16x16 words: Transformers for image recognition at scale. In Proceedings of the International Conference on Learning Representations."},{"key":"2201_CR16","doi-asserted-by":"crossref","unstructured":"Fang, Y., Yang, S., Wang, S., Ge, Y., Shan, Y., & Wang, X.(2023). Unleashing vanilla vision transformer with masked image modeling for object detection. In Proceedings of the IEEE International Conference on Computer Vision (pp. 6244\u20136253).","DOI":"10.1109\/ICCV51070.2023.00574"},{"issue":"5","key":"2201_CR17","doi-asserted-by":"publisher","first-page":"1691","DOI":"10.1007\/s11263-020-01428-6","volume":"129","author":"X Fu","year":"2021","unstructured":"Fu, X., Qi, Q., Zha, Z.-J., Ding, X., Wu, F., & Paisley, J. (2021). Successive graph convolutional network for image de-raining. International Journal of Computer Vision, 129(5), 1691\u20131711.","journal-title":"International Journal of Computer Vision"},{"key":"2201_CR18","doi-asserted-by":"crossref","unstructured":"Girshick, R., Donahue, J., Darrell, T., & Malik, J. (2014). Rich feature hierarchies for accurate object detection and semantic segmentation. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 580\u2013587).","DOI":"10.1109\/CVPR.2014.81"},{"key":"2201_CR19","doi-asserted-by":"crossref","unstructured":"Guo, C., Fan, B., Gu, J., Zhang, Q., Xiang, S., Prinet, V., & Pan, C. (2019). Progressive sparse local attention for video object detection. In Proceedings of the IEEE International Conference on Computer Vision (pp. 3909\u20133918).","DOI":"10.1109\/ICCV.2019.00401"},{"key":"2201_CR20","doi-asserted-by":"crossref","unstructured":"Han, L., Wang, P., Yin, Z., Wang, F., & Li, H. (2020a). Exploiting better feature aggregation for video object detection. In Proceedings of the ACM International Conference on Multimedia (pp. 1469\u20131477).","DOI":"10.1145\/3394171.3413927"},{"issue":"10","key":"2201_CR21","doi-asserted-by":"publisher","first-page":"2927","DOI":"10.1007\/s11263-021-01507-2","volume":"129","author":"L Han","year":"2021","unstructured":"Han, L., Wang, P., Yin, Z., Wang, F., & Li, H. (2021). Context and structure mining network for video object detection. International Journal of Computer Vision, 129(10), 2927\u20132946.","journal-title":"International Journal of Computer Vision"},{"issue":"12","key":"2201_CR22","doi-asserted-by":"publisher","first-page":"8165","DOI":"10.1109\/TCSVT.2021.3094533","volume":"32","author":"L Han","year":"2022","unstructured":"Han, L., Wang, P., Yin, Z., Wang, F., & Li, H. (2022). Class-aware feature aggregation network for video object detection. IEEE Transactions on Circuits and Systems for Video Technology, 32(12), 8165\u20138178.","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"2201_CR23","doi-asserted-by":"publisher","first-page":"3681","DOI":"10.1109\/TMM.2022.3164253","volume":"25","author":"L Han","year":"2023","unstructured":"Han, L., & Yin, Z. (2023). Global memory and local continuity for video object detection. IEEE Transactions on Multimedia, 25, 3681\u20133693.","journal-title":"IEEE Transactions on Multimedia"},{"key":"2201_CR24","doi-asserted-by":"crossref","unstructured":"Han, M., Wang, Y., Chang, X., & Qiao, Y. (2020b). Mining inter-video proposal relations for video object detection. In Proceedings of the European Conference on Computer Vision (pp. 431\u2013446).","DOI":"10.1007\/978-3-030-58589-1_26"},{"key":"2201_CR25","unstructured":"Han, W., Khorrami, P., Paine, T.L., Ramachandran, P., Babaeizadeh, M., Shi, H., Li, J., Yan, S., & Huang, T.S. (2016). Seq-nms for video object detection. Preprint retrieved from arXiv:1602.08465"},{"issue":"9","key":"2201_CR26","doi-asserted-by":"publisher","first-page":"1027","DOI":"10.1007\/s11263-018-1077-3","volume":"126","author":"H Hattori","year":"2018","unstructured":"Hattori, H., Lee, N., Boddeti, V. N., Beainy, F., Kitani, K. M., & Kanade, T. (2018). Synthesizing a scene-specific pedestrian detector and pose estimator for static video surveillance. International Journal of Computer Vision, 126(9), 1027\u20131044.","journal-title":"International Journal of Computer Vision"},{"key":"2201_CR27","doi-asserted-by":"crossref","unstructured":"He, F., Gao, N., Jia, J., Zhao, X., & Huang, K. (2022a). Queryprop: Object query propagation for high-performance video object detection. In Proceedings of the AAAI Conference on Artificial Intelligence (pp. 2620\u20132627).","DOI":"10.1609\/aaai.v36i1.19965"},{"key":"2201_CR28","doi-asserted-by":"crossref","unstructured":"He, F., Gao, N., Li, Q., Du, S., Zhao, X., & Huang, K. (2020). Temporal context enhanced feature aggregation for video object detection. In Proceedings of the AAAI Conference on Artificial Intelligence (pp. 10941\u201310948).","DOI":"10.1609\/aaai.v34i07.6727"},{"key":"2201_CR29","doi-asserted-by":"crossref","unstructured":"He, F., Li, Q., Zhao, X., & Huang, K. (2022b). Temporal-adaptive sparse feature aggregation for video object detection. Pattern Recognition, 127, 108587.","DOI":"10.1016\/j.patcog.2022.108587"},{"key":"2201_CR30","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., & Girshick, R. (2017). Mask R-CNN. In Proceedings of the IEEE International Conference on Computer Vision (pp. 2961\u20132969).","DOI":"10.1109\/ICCV.2017.322"},{"key":"2201_CR31","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 770\u2013778).","DOI":"10.1109\/CVPR.2016.90"},{"key":"2201_CR32","doi-asserted-by":"crossref","unstructured":"He, L., Zhou, Q., Li, X., Niu, L., Cheng, G., Li, X., Liu, W., Tong, Y., Ma, L., & Zhang, L. (2021). End-to-end video object detection with spatial-temporal transformers. In Proceedings of the ACM International Conference on Multimedia (pp. 1507\u20131516).","DOI":"10.1145\/3474085.3475285"},{"key":"2201_CR33","doi-asserted-by":"crossref","unstructured":"Jiang, Z., Gao, P., Guo, C., Zhang, Q., Xiang, S., & Pan, C. (2019). Video object detection with locally-weighted deformable neighbors. In Proceedings of the AAAI Conference on Artificial Intelligence (pp. 8529\u20138536).","DOI":"10.1609\/aaai.v33i01.33018529"},{"key":"2201_CR34","doi-asserted-by":"crossref","unstructured":"Jiang, Z., Liu, Y., Yang, C., Liu, J., Zhang, Q., Xiang, S., & Pan, C. (2020). Learning where to focus for efficient video object detection. In Proceedings of the European Conference on Computer Vision (pp. 18\u201334).","DOI":"10.1007\/978-3-030-58517-4_2"},{"issue":"4","key":"2201_CR35","doi-asserted-by":"publisher","first-page":"1793","DOI":"10.1109\/TPAMI.2020.3029948","volume":"44","author":"G Kang","year":"2022","unstructured":"Kang, G., Jiang, L., Wei, Y., Yang, Y., & Hauptmann, A. G. (2022). Contrastive adaptation network for single- and multi-source domain adaptation. IEEE Transactions on Pattern Analysis and Machine Intelligence, 44(4), 1793\u20131804.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"issue":"10","key":"2201_CR36","doi-asserted-by":"publisher","first-page":"2896","DOI":"10.1109\/TCSVT.2017.2736553","volume":"28","author":"K Kang","year":"2017","unstructured":"Kang, K., Li, H., Yan, J., Zeng, X., Yang, B., Xiao, T., Zhang, C., Wang, Z., Wang, R., Wang, X., et al. (2017). T-cnn: Tubelets with convolutional neural networks for object detection from videos. IEEE Transactions on Circuits and Systems for Video Technology, 28(10), 2896\u20132907.","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"2201_CR37","unstructured":"Khosla, P., Teterwak, P., Wang, C., Sarna, A., Tian, Y., Isola, P., Maschinot, A., Liu, C., & Krishnan, D. (2020). Supervised contrastive learning. In Proceedings of the Advances in Neural Information Processing Systems (pp. 18661\u201318673)."},{"key":"2201_CR38","doi-asserted-by":"crossref","unstructured":"Kim, S., Park, S., Na, B., & Yoon, S. (2020). Spiking-yolo: Spiking neural network for energy-efficient object detection. In Proceedings of the AAAI Conference on Artificial Intelligence (pp. 11270\u201311277).","DOI":"10.1609\/aaai.v34i07.6787"},{"key":"2201_CR39","unstructured":"Kipf, T.N., & Welling, M. (2017). Semi-supervised classification with graph convolutional networks. In Processings of the International Conference on Learning Representations (pp. 565\u2013578)."},{"key":"2201_CR40","doi-asserted-by":"crossref","unstructured":"Li, F., Zhang, H., Xu, H., Liu, S., Zhang, L., Ni, L. M., & Shum, H. -Y. (2023).Mask dino: Towards a unified transformer-based framework for object detection and segmentation. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 3041\u20133050).","DOI":"10.1109\/CVPR52729.2023.00297"},{"issue":"3","key":"2201_CR41","doi-asserted-by":"publisher","first-page":"225","DOI":"10.1007\/s11263-018-1101-7","volume":"127","author":"H Li","year":"2019","unstructured":"Li, H., Liu, Y., Ouyang, W., & Wang, X. (2019). Zoom out-and-in network with map attention decision for region proposal and object detection. International Journal of Computer Vision, 127(3), 225\u2013238.","journal-title":"International Journal of Computer Vision"},{"key":"2201_CR42","doi-asserted-by":"crossref","unstructured":"Li, H., Pan, X., Yan, K., Tang, F., & Zheng, W. -S. (2022a). Siod: Single instance annotated per category per image for object detection. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 14197\u201314206).","DOI":"10.1109\/CVPR52688.2022.01380"},{"key":"2201_CR43","doi-asserted-by":"crossref","unstructured":"Li, J., Cheng, B., Feris, R., Xiong, J., Huang, T. S., Hwu, W. -M., & Shi, H. (2021a). Pseudo-iou: Improving label assignment in anchor-free object detection. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops (pp. 2378\u20132387).","DOI":"10.1109\/CVPRW53098.2021.00270"},{"key":"2201_CR44","doi-asserted-by":"crossref","unstructured":"Li, Y., Mao, H., Girshick, R., & He, K. (2022b). Exploring plain vision transformer backbones for object detection. In Proceedings of the European Conference on Computer Vision (pp. 280\u2013296).","DOI":"10.1007\/978-3-031-20077-9_17"},{"key":"2201_CR45","doi-asserted-by":"crossref","unstructured":"Li, Z., Xi, T., Zhang, G., Liu, J., & He, R. (2021b). Autodet: Pyramid network architecture search for object detection. International Journal of Computer Vision, 129(4), 1087\u20131105.","DOI":"10.1007\/s11263-020-01415-x"},{"key":"2201_CR46","doi-asserted-by":"crossref","unstructured":"Lin, L., Chen, H., Zhang, H., Li, Y., Shan, Y., & Wang, H. (2020). Dual semantic fusion network for video object detection. In Proceedings of the ACM International Conference on Multimedia (pp. 1855\u20131863).","DOI":"10.1145\/3394171.3413583"},{"key":"2201_CR47","doi-asserted-by":"crossref","unstructured":"Lin, T. -Y., Dollar, P., Girshick, R., He, K., Hariharan, B., & Belongie, S. (2017a). Feature pyramid networks for object detection. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 2117\u20132125).","DOI":"10.1109\/CVPR.2017.106"},{"key":"2201_CR48","doi-asserted-by":"crossref","unstructured":"Lin, T. -Y., Goyal, P., Girshick, R., He, K., & Dollar, P. (2017b). Focal loss for dense object detection. In Proceedings of the IEEE International Conference on Computer Vision (pp. 2980\u20132988).","DOI":"10.1109\/ICCV.2017.324"},{"issue":"2","key":"2201_CR49","doi-asserted-by":"publisher","first-page":"261","DOI":"10.1007\/s11263-019-01247-4","volume":"128","author":"L Liu","year":"2020","unstructured":"Liu, L., Ouyang, W., Wang, X., Fieguth, P., Chen, J., Liu, X., & Pietik\u00e4inen, M. (2020). Deep learning for generic object detection: A survey. International Journal of Computer Vision, 128(2), 261\u2013318.","journal-title":"International Journal of Computer Vision"},{"key":"2201_CR50","doi-asserted-by":"crossref","unstructured":"Liu, W., Anguelov, D., Erhan, D., Szegedy, C., Reed, S., Fu, C. -Y., & Berg, A. C. (2016). Ssd: Single shot multibox detector. In Proceedings of the European Conference on Computer Vision (pp. 21\u201337).","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"2201_CR51","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., & Guo, B. (2021). Swin transformer: Hierarchical vision transformer using shifted windows. In Proceedings of the IEEE International Conference on Computer Vision (pp. 10012\u201310022).","DOI":"10.1109\/ICCV48922.2021.00986"},{"issue":"3","key":"2201_CR52","doi-asserted-by":"publisher","first-page":"287","DOI":"10.1007\/s11263-006-0024-x","volume":"74","author":"A Lookingbill","year":"2007","unstructured":"Lookingbill, A., Rogers, J., Lieb, D., Curry, J., & Thrun, S. (2007). Reverse optical flow for self-supervised adaptive autonomous robot navigation. International Journal of Computer Vision, 74(3), 287\u2013302.","journal-title":"International Journal of Computer Vision"},{"key":"2201_CR53","unstructured":"Luo, H., Huang, L., Shen, H., Li, Y., Huang, C., Wang, X. (2019). Object detection in video with spatial-temporal context aggregation. Preprint retrieved from arXiv:1907.04988"},{"key":"2201_CR54","doi-asserted-by":"crossref","unstructured":"Nan, G., Qiao, R., Xiao, Y., Liu, J., Leng, S., Zhang, H., & Lu, W. (2021). Interventional video grounding with dual contrastive learning. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 2765\u20132775).","DOI":"10.1109\/CVPR46437.2021.00279"},{"issue":"6","key":"2201_CR55","doi-asserted-by":"publisher","first-page":"7853","DOI":"10.1109\/TPAMI.2022.3223955","volume":"45","author":"Z Qianyu","year":"2023","unstructured":"Qianyu, Z., Li, X., He, L., Yang, Y., Cheng, G., Tong, Y., Ma, L., & Tao, D. (2023). Transvod: End-to-end video object detection with spatial-temporal transformers. IEEE Transactions on Pattern Analysis and Machine Intelligence, 45(6), 7853\u20137869.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2201_CR56","doi-asserted-by":"publisher","first-page":"4128","DOI":"10.1109\/TIP.2023.3285136","volume":"32","author":"Q Qi","year":"2023","unstructured":"Qi, Q., Hou, T., Lu, Y., Yan, Y., & Wang, H. (2023). Dgrnet: A dual-level graph relation network for video object detection. IEEE Transactions on Image Processing, 32, 4128\u20134141.","journal-title":"IEEE Transactions on Image Processing"},{"issue":"11","key":"2201_CR57","doi-asserted-by":"publisher","first-page":"20926","DOI":"10.1109\/TITS.2022.3176721","volume":"23","author":"Q Qi","year":"2022","unstructured":"Qi, Q., Wang, X., Hou, T., Yan, Y., & Wang, H. (2022). Fastvod-net: A real-time and high-accuracy video object detector. IEEE Transactions on Intelligent Transportation Systems, 23(11), 20926\u201320942.","journal-title":"IEEE Transactions on Intelligent Transportation Systems"},{"key":"2201_CR58","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., & Clark, J., et\u00a0al. (2021). Learning transferable visual models from natural language supervision. In Proceedings of the International Conference on Machine Learning (pp. 8748\u20138763)."},{"key":"2201_CR59","doi-asserted-by":"crossref","unstructured":"Redmon, J., Divvala, S., Girshick, R., & Farhadi, A. (2016). You only look once: Unified, real-time object detection. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 779\u2013788).","DOI":"10.1109\/CVPR.2016.91"},{"key":"2201_CR60","unstructured":"Ren, S., He, K., Girshick, R., & Sun, J. (2015). Faster R-CNN: Towards real-time object detection with region proposal networks. In Proceedings of the Advances in Neural Information Processing Systems (pp. 91\u201399)."},{"issue":"3","key":"2201_CR61","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., Huang, Z., Karpathy, A., Khosla, A., Bernstein, M., et al. (2015). Imagenet large scale visual recognition challenge. International Journal of Computer Vision, 115(3), 211\u2013252.","journal-title":"International Journal of Computer Vision"},{"key":"2201_CR62","doi-asserted-by":"crossref","unstructured":"Shvets, M., Liu, W., & Berg, A. C. (2019). Leveraging long-range temporal relationships between proposals for video object detection. In Proceedings of the IEEE International Conference on Computer Vision (pp. 9756\u20139764).","DOI":"10.1109\/ICCV.2019.00985"},{"key":"2201_CR63","doi-asserted-by":"crossref","unstructured":"Sun, G., Hua, Y., Hu, G., & Robertson, N. (2021a). Mamba: Multi-level aggregation via memory bank for video object detection. In Proceedings of the AAAI Conference on Artificial Intelligence (pp. 2620\u20132627).","DOI":"10.1609\/aaai.v35i3.16365"},{"key":"2201_CR64","doi-asserted-by":"crossref","unstructured":"Sun, P., Zhang, R., Jiang, Y., Kong, T., Xu, C., Zhan, W., Tomizuka, M., Li, L., Yuan, Z., Wang, C., & Luo, P. (2021b). Sparse R-CNN: End-to-end object detection with learnable proposals. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 14454\u201314463).","DOI":"10.1109\/CVPR46437.2021.01422"},{"issue":"2","key":"2201_CR65","doi-asserted-by":"publisher","first-page":"393","DOI":"10.1109\/TMM.2016.2614862","volume":"19","author":"Y Tang","year":"2017","unstructured":"Tang, Y., Wang, X., Dellandr\u00e9a, E., & Chen, L. (2017). Weakly supervised learning of deformable part-based models for object detection via region proposals. IEEE Transactions on Multimedia, 19(2), 393\u2013407.","journal-title":"IEEE Transactions on Multimedia"},{"issue":"5","key":"2201_CR66","doi-asserted-by":"publisher","first-page":"1272","DOI":"10.1109\/TPAMI.2019.2910529","volume":"42","author":"P Tang","year":"2019","unstructured":"Tang, P., Wang, C., Wang, X., Liu, W., Zeng, W., & Wang, J. (2019). Object detection in videos by high quality object linking. IEEE Transactions on Pattern Analysis and Machine Intelligence, 42(5), 1272\u20131278.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"issue":"4","key":"2201_CR67","first-page":"1922","volume":"44","author":"Z Tian","year":"2022","unstructured":"Tian, Z., Shen, C., Chen, H., & He, T. (2022). Fcos: A simple and strong anchor-free object detector. IEEE Transactions on Pattern Analysis and Machine Intelligence, 44(4), 1922\u20131933.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"issue":"11","key":"2201_CR68","first-page":"2579","volume":"9","author":"L Van der Maaten","year":"2008","unstructured":"Van der Maaten, L., & Hinton, G. (2008). Visualizing data using t-SNE. Journal of Machine Learning Research, 9(11), 2579\u20132605.","journal-title":"Journal of Machine Learning Research"},{"key":"2201_CR69","doi-asserted-by":"crossref","unstructured":"Wang, H., Tang, J., Liu, X., Guan, S., Xie, R., & Song, L. (2022). Ptseformer: Progressive temporal-spatial enhanced transformer towards video object detection. In Proceedings of the European Conference on Computer Vision (pp. 732\u2013747).","DOI":"10.1007\/978-3-031-20074-8_42"},{"key":"2201_CR70","doi-asserted-by":"crossref","unstructured":"Wang, S., Zhou, Y., Yan, J., & Deng, Z. (2018). Fully motion-aware network for video object detection. In Proceedings of the European Conference on Computer Vision (pp. 542\u2013557).","DOI":"10.1007\/978-3-030-01261-8_33"},{"key":"2201_CR71","doi-asserted-by":"crossref","unstructured":"Wu, H., Chen, Y., Wang, N., & Zhang, Z. (2019). Sequence level semantics aggregation for video object detection. In Proceedings of the IEEE International Conference on Computer Vision (pp. 9217\u20139225).","DOI":"10.1109\/ICCV.2019.00931"},{"key":"2201_CR72","doi-asserted-by":"crossref","unstructured":"Wu, H., Qu, Y., Lin, S., Zhou, J., Qiao, R., Zhang, Z., Xie, Y., & Ma, L. (2021). Contrastive learning for compact single image dehazing. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 10551\u201310560).","DOI":"10.1109\/CVPR46437.2021.01041"},{"key":"2201_CR73","doi-asserted-by":"crossref","unstructured":"Xiao, F., & Jae\u00a0Lee, Y. (2018). Video object detection with an aligned spatial-temporal memory. In Proceedings of the European Conference on Computer Vision (pp. 485\u2013501).","DOI":"10.1007\/978-3-030-01237-3_30"},{"key":"2201_CR74","doi-asserted-by":"crossref","unstructured":"Xie, S., Girshick, R., Doll\u00e1r, P., Tu, Z., & He, K. (2017). Aggregated residual transformations for deep neural networks. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 1492\u20131500).","DOI":"10.1109\/CVPR.2017.634"},{"key":"2201_CR75","doi-asserted-by":"crossref","unstructured":"Xie, X., Cheng, G., Wang, J., Yao, X., & Han, J.(2021). Oriented R-CNN for object detection. In Proceedings of the IEEE International Conference on Computer Vision (pp. 3520\u20133529).","DOI":"10.1109\/ICCV48922.2021.00350"},{"key":"2201_CR76","doi-asserted-by":"crossref","unstructured":"Xing, B., Ying, X., Wang, R., Yang, J., & Chen, T. (2023). Cross-modal contrastive learning for domain adaptation in 3d semantic segmentation. In Proceedings of the AAAI Conference on Artificial Intelligence (pp. 2974\u20132982).","DOI":"10.1609\/aaai.v37i3.25400"},{"issue":"11","key":"2201_CR77","doi-asserted-by":"publisher","first-page":"7809","DOI":"10.1109\/TCSVT.2022.3183646","volume":"32","author":"C Xu","year":"2022","unstructured":"Xu, C., Zhang, J., Wang, M., Tian, G., & Liu, Y. (2022). Multi-level spatial-temporal feature aggregation for video object detection. IEEE Transactions on Circuits and Systems for Video Technology, 32(11), 7809\u20137820.","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"issue":"11","key":"2201_CR78","doi-asserted-by":"publisher","first-page":"8557","DOI":"10.1109\/TIE.2018.2886798","volume":"66","author":"H Yang","year":"2018","unstructured":"Yang, H., Qi, J., Miao, Y., Sun, H., & Li, J. (2018). A new robot navigation algorithm based on a double-layer ant algorithm and trajectory optimization. IEEE Transactions on Industrial Electronics, 66(11), 8557\u20138566.","journal-title":"IEEE Transactions on Industrial Electronics"},{"key":"2201_CR79","doi-asserted-by":"crossref","unstructured":"Yang, Z., Qin, J., & Huang, D. (2022). Acgnet: Action complement graph network for weakly-supervised temporal action localization. In Proceedings of the AAAI Conference on Artificial Intelligence (pp. 3090\u20133098).","DOI":"10.1609\/aaai.v36i3.20216"},{"key":"2201_CR80","doi-asserted-by":"crossref","unstructured":"Ye, S., Xie, Y., Chen, D., Xu, Y., Yuan, L., Zhu, C., & Liao, J. (2023). Improving commonsense in vision-language models via knowledge graph riddles. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 2634\u20132645).","DOI":"10.1109\/CVPR52729.2023.00259"},{"key":"2201_CR81","doi-asserted-by":"publisher","first-page":"2078","DOI":"10.1109\/TIP.2019.2947806","volume":"29","author":"H Zhang","year":"2020","unstructured":"Zhang, H., Tian, Y., Wang, K., Zhang, W., & Wang, F.-Y. (2020). Mask SSD: An effective single-stage approach to object instance segmentation. IEEE Transactions on Image Processing, 29, 2078\u20132093.","journal-title":"IEEE Transactions on Image Processing"},{"issue":"9","key":"2201_CR82","first-page":"5185","volume":"44","author":"J Zhang","year":"2022","unstructured":"Zhang, J., Jia, X., Hu, J., & Tan, K. (2022). Moving vehicle detection for remote sensing video surveillance with nonstationary satellite platform. IEEE Transactions on Pattern Analysis and Machine Intelligence, 44(9), 5185\u20135198.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"issue":"9","key":"2201_CR83","doi-asserted-by":"publisher","first-page":"3848","DOI":"10.1109\/TITS.2019.2935152","volume":"21","author":"L Zhao","year":"2019","unstructured":"Zhao, L., Song, Y., Zhang, C., Liu, Y., Wang, P., Lin, T., Deng, M., & Li, H. (2019). T-GCN: A temporal graph convolutional network for traffic prediction. IEEE Transactions on Intelligent Transportation Systems, 21(9), 3848\u20133858.","journal-title":"IEEE Transactions on Intelligent Transportation Systems"},{"key":"2201_CR84","doi-asserted-by":"crossref","unstructured":"Zheng, W., Tang, W., Jiang, L., Fu, & C. -W. (2021). SE-SSD: Self-ensembling single-stage object detector from point cloud. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 14494\u201314503).","DOI":"10.1109\/CVPR46437.2021.01426"},{"key":"2201_CR85","doi-asserted-by":"crossref","unstructured":"Zhu, C., He, Y., & Savvides, M. (2019). Feature selective anchor-free module for single-shot object detection. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 840\u2013849).","DOI":"10.1109\/CVPR.2019.00093"},{"key":"2201_CR86","doi-asserted-by":"crossref","unstructured":"Zhu, X., Dai, J., Yuan, L., & Wei, Y. (2018). Towards high performance video object detection. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 7210\u20137218).","DOI":"10.1109\/CVPR.2018.00753"},{"key":"2201_CR87","doi-asserted-by":"crossref","unstructured":"Zhu, X., Xiong, Y., Dai, J., Yuan, L., & Wei, Y. (2017a). Deep feature flow for video recognition. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 2349\u20132358).","DOI":"10.1109\/CVPR.2017.441"},{"key":"2201_CR88","doi-asserted-by":"crossref","unstructured":"Zhu, X., Wang, Y., Dai, J., Yuan, L., & Wei, Y. (2017b). Flow-guided feature aggregation for video object detection. In Proceedings of the IEEE International Conference on Computer Vision (pp. 408\u2013417).","DOI":"10.1109\/ICCV.2017.52"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02201-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-024-02201-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02201-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,22]],"date-time":"2025-01-22T06:42:23Z","timestamp":1737528143000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-024-02201-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,23]]},"references-count":88,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,2]]}},"alternative-id":["2201"],"URL":"https:\/\/doi.org\/10.1007\/s11263-024-02201-9","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,8,23]]},"assertion":[{"value":"2 April 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 July 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 August 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}