{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T02:02:29Z","timestamp":1780020149045,"version":"3.53.1"},"reference-count":59,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Knowledge-Based Systems"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.knosys.2026.116117","type":"journal-article","created":{"date-parts":[[2026,5,9]],"date-time":"2026-05-09T15:15:25Z","timestamp":1778339725000},"page":"116117","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["StratiFormer: Stratified Temporal Transformer for video object detection"],"prefix":"10.1016","volume":"346","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-8304-947X","authenticated-orcid":false,"given":"Wentao","family":"Zheng","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hong","family":"Zheng","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yuquan","family":"Sun","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ying","family":"Jing","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.knosys.2026.116117_b1","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.113237","article-title":"FedVOD: A two-stage video object detector training framework based on federated unsupervised learning and feature post-processing","volume":"315","author":"Hu","year":"2025","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2026.116117_b2","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.113200","article-title":"Weakly-supervised spatial\u2013temporal video grounding via spatial\u2013temporal annotation on a single frame","volume":"314","author":"Luo","year":"2025","journal-title":"Knowl.-Based Syst."},{"issue":"8","key":"10.1016\/j.knosys.2026.116117_b3","doi-asserted-by":"crossref","first-page":"3195","DOI":"10.1109\/TNNLS.2021.3053249","article-title":"New generation deep learning for video object detection: A survey","volume":"33","author":"Jiao","year":"2021","journal-title":"IEEE Trans. Neural Networks Learn. Syst."},{"key":"10.1016\/j.knosys.2026.116117_b4","doi-asserted-by":"crossref","first-page":"39","DOI":"10.1016\/j.neucom.2020.01.085","article-title":"Recent advances in deep learning for object detection","volume":"396","author":"Wu","year":"2020","journal-title":"Neurocomputing"},{"key":"10.1016\/j.knosys.2026.116117_b5","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.114067","article-title":"TA-EnVODT: A cloud-edge integrated temporal-aware ensemble for real-time video object detection and tracking using knowledge projection","volume":"326","author":"S. K","year":"2025","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2026.116117_b6","doi-asserted-by":"crossref","unstructured":"X. Zhu, Y. Wang, J. Dai, L. Yuan, Y. Wei, Flow-Guided Feature Aggregation for Video Object Detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, ICCV, 2017, pp. 408\u2013417.","DOI":"10.1109\/ICCV.2017.52"},{"key":"10.1016\/j.knosys.2026.116117_b7","doi-asserted-by":"crossref","unstructured":"X. Zhu, Y. Xiong, J. Dai, L. Yuan, Y. Wei, Deep Feature Flow for Video Recognition, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2017, pp. 4141\u20134150.","DOI":"10.1109\/CVPR.2017.441"},{"key":"10.1016\/j.knosys.2026.116117_b8","doi-asserted-by":"crossref","unstructured":"A. Dosovitskiy, P. Fischer, E. Ilg, P. H\u00e4usser, C. Hazirbas, V. Golkov, P. van der Smagt, D. Cremers, T. Brox, FlowNet: Learning Optical Flow with Convolutional Networks, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, ICCV, 2015, pp. 2758\u20132766.","DOI":"10.1109\/ICCV.2015.316"},{"key":"10.1016\/j.knosys.2026.116117_b9","doi-asserted-by":"crossref","unstructured":"J. Deng, Y. Pan, T. Yao, W. Zhou, H. Li, T. Mei, Relation Distillation Networks for Video Object Detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, ICCV, 2019, pp. 7022\u20137031.","DOI":"10.1109\/ICCV.2019.00712"},{"key":"10.1016\/j.knosys.2026.116117_b10","doi-asserted-by":"crossref","unstructured":"H. Wu, Y. Chen, N. Wang, Z. Zhang, Sequence Level Semantics Aggregation for Video Object Detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, ICCV, 2019, pp. 9216\u20139224.","DOI":"10.1109\/ICCV.2019.00931"},{"key":"10.1016\/j.knosys.2026.116117_b11","doi-asserted-by":"crossref","unstructured":"L. He, Q. Zhou, X. Li, L. Niu, G. Cheng, X. Li, W. Liu, Y. Tong, L. Ma, L. Zhang, End-to-end video object detection with spatial-temporal transformers, in: Proceedings of the 29th ACM International Conference on Multimedia, 2021, pp. 1507\u20131516.","DOI":"10.1145\/3474085.3475285"},{"issue":"6","key":"10.1016\/j.knosys.2026.116117_b12","doi-asserted-by":"crossref","first-page":"7853","DOI":"10.1109\/TPAMI.2022.3223955","article-title":"TransVOD: End-to-end video object detection with spatial-temporal transformers","volume":"45","author":"Zhou","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.knosys.2026.116117_b13","doi-asserted-by":"crossref","unstructured":"Y. Chen, Y. Cao, H. Hu, L. Wang, Memory Enhanced Global-Local Aggregation for Video Object Detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2020, pp. 10334\u201310343.","DOI":"10.1109\/CVPR42600.2020.01035"},{"key":"10.1016\/j.knosys.2026.116117_b14","doi-asserted-by":"crossref","unstructured":"T.-Y. Lin, P. Doll\u00e1r, R. Girshick, K. He, B. Hariharan, S. Belongie, Feature pyramid networks for object detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2017, pp. 2117\u20132125.","DOI":"10.1109\/CVPR.2017.106"},{"key":"10.1016\/j.knosys.2026.116117_b15","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2024.112505","article-title":"Bridging spatiotemporal feature gap for video salient object detection","volume":"304","author":"Tan","year":"2024","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2026.116117_b16","doi-asserted-by":"crossref","first-page":"1992","DOI":"10.1109\/TIP.2023.3261752","article-title":"Cyclic self-training with proposal weight modulation for cross-supervised object detection","volume":"32","author":"Xu","year":"2023","journal-title":"IEEE Trans. Image Process."},{"issue":"3","key":"10.1016\/j.knosys.2026.116117_b17","doi-asserted-by":"crossref","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","article-title":"Imagenet large scale visual recognition challenge","volume":"115","author":"Russakovsky","year":"2015","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.knosys.2026.116117_b18","doi-asserted-by":"crossref","unstructured":"R.B. Girshick, J. Donahue, T. Darrell, J. Malik, Rich Feature Hierarchies for Accurate Object Detection and Semantic Segmentation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2014, pp. 580\u2013587.","DOI":"10.1109\/CVPR.2014.81"},{"key":"10.1016\/j.knosys.2026.116117_b19","doi-asserted-by":"crossref","unstructured":"R.B. Girshick, Fast R-CNN, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, ICCV, 2015, pp. 1440\u20131448.","DOI":"10.1109\/ICCV.2015.169"},{"key":"10.1016\/j.knosys.2026.116117_b20","unstructured":"S. Ren, K. He, R.B. Girshick, J. Sun, Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks, in: Advances in Neural Information Processing Systems (NeurIPS), 2015, pp. 91\u201399."},{"key":"10.1016\/j.knosys.2026.116117_b21","doi-asserted-by":"crossref","unstructured":"J. Redmon, S.K. Divvala, R.B. Girshick, A. Farhadi, You Only Look Once: Unified, Real-Time Object Detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2016, pp. 779\u2013788.","DOI":"10.1109\/CVPR.2016.91"},{"key":"10.1016\/j.knosys.2026.116117_b22","doi-asserted-by":"crossref","unstructured":"J. Redmon, A. Farhadi, YOLO9000: Better, Faster, Stronger, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2017, pp. 6517\u20136525.","DOI":"10.1109\/CVPR.2017.690"},{"key":"10.1016\/j.knosys.2026.116117_b23","doi-asserted-by":"crossref","unstructured":"W. Liu, D. Anguelov, D. Erhan, C. Szegedy, S. Reed, C.-Y. Fu, A.C. Berg, Ssd: Single shot multibox detector, in: Proceedings of the European Conference on Computer Vision, ECCV, 2016, pp. 21\u201337.","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"10.1016\/j.knosys.2026.116117_b24","doi-asserted-by":"crossref","unstructured":"N. Carion, F. Massa, G. Synnaeve, N. Usunier, A. Kirillov, S. Zagoruyko, End-to-end object detection with transformers, in: Proceedings of the European Conference on Computer Vision, ECCV, 2020, pp. 213\u2013229.","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"10.1016\/j.knosys.2026.116117_b25","unstructured":"X. Zhu, W. Su, L. Lu, B. Li, X. Wang, J. Dai, Deformable DETR: Deformable Transformers for End-to-End Object Detection, in: Proceedings of the International Conference on Learning Representations, ICLR, 2021."},{"key":"10.1016\/j.knosys.2026.116117_b26","doi-asserted-by":"crossref","unstructured":"D. Meng, X. Chen, Z. Fan, G. Zeng, H. Li, Y. Yuan, L. Sun, J. Wang, Conditional DETR for Fast Training Convergence, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, ICCV, 2021, pp. 3631\u20133640.","DOI":"10.1109\/ICCV48922.2021.00363"},{"key":"10.1016\/j.knosys.2026.116117_b27","unstructured":"S. Liu, F. Li, H. Zhang, X. Yang, X. Qi, H. Su, J. Zhu, L. Zhang, DAB-DETR: Dynamic Anchor Boxes are Better Queries for DETR, in: Proceedings of the International Conference on Learning Representations, ICLR, 2022."},{"key":"10.1016\/j.knosys.2026.116117_b28","doi-asserted-by":"crossref","unstructured":"F. Li, H. Zhang, S. Liu, J. Guo, L.M. Ni, L. Zhang, Dn-detr: Accelerate detr training by introducing query denoising, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2022, pp. 13619\u201313627.","DOI":"10.1109\/CVPR52688.2022.01325"},{"key":"10.1016\/j.knosys.2026.116117_b29","doi-asserted-by":"crossref","unstructured":"Y. Zhao, W. Lv, S. Xu, J. Wei, G. Wang, Q. Dang, Y. Liu, J. Chen, DETRs Beat YOLOs on Real-time Object Detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2024, pp. 16965\u201316974.","DOI":"10.1109\/CVPR52733.2024.01605"},{"key":"10.1016\/j.knosys.2026.116117_b30","series-title":"Anchor DETR: Query design for transformer-based object detection","author":"Wang","year":"2021"},{"key":"10.1016\/j.knosys.2026.116117_b31","doi-asserted-by":"crossref","unstructured":"X. Hou, M. Liu, S. Zhang, P. Wei, B. Chen, X. Lan, Relation detr: Exploring explicit position relation prior for object detection, in: Proceedings of the European Conference on Computer Vision, ECCV, 2024, pp. 89\u2013105.","DOI":"10.1007\/978-3-031-72973-7_6"},{"issue":"3","key":"10.1016\/j.knosys.2026.116117_b32","doi-asserted-by":"crossref","first-page":"257","DOI":"10.1109\/JPROC.2023.3238524","article-title":"Object detection in 20 years: A survey","volume":"111","author":"Zou","year":"2023","journal-title":"Proc. IEEE"},{"key":"10.1016\/j.knosys.2026.116117_b33","doi-asserted-by":"crossref","unstructured":"Z. Jiang, Y. Liu, C. Yang, J. Liu, P. Gao, Q. Zhang, S. Xiang, C. Pan, Learning where to focus for efficient video object detection, in: Proceedings of the European Conference on Computer Vision, ECCV, 2020, pp. 18\u201334.","DOI":"10.1007\/978-3-030-58517-4_2"},{"issue":"10","key":"10.1016\/j.knosys.2026.116117_b34","doi-asserted-by":"crossref","first-page":"2896","DOI":"10.1109\/TCSVT.2017.2736553","article-title":"T-CNN: Tubelets with convolutional neural networks for object detection from videos","volume":"28","author":"Kang","year":"2018","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.knosys.2026.116117_b35","doi-asserted-by":"crossref","unstructured":"S. Wang, Y. Zhou, J. Yan, Z. Deng, Fully motion-aware network for video object detection, in: Proceedings of the European Conference on Computer Vision, ECCV, 2018, pp. 542\u2013557.","DOI":"10.1007\/978-3-030-01261-8_33"},{"key":"10.1016\/j.knosys.2026.116117_b36","doi-asserted-by":"crossref","unstructured":"C. Guo, B. Fan, J. Gu, Q. Zhang, S. Xiang, V. Prinet, C. Pan, Progressive Sparse Local Attention for Video Object Detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, ICCV, 2019, pp. 3908\u20133917.","DOI":"10.1109\/ICCV.2019.00401"},{"key":"10.1016\/j.knosys.2026.116117_b37","doi-asserted-by":"crossref","unstructured":"H. Deng, Y. Hua, T. Song, Z. Zhang, Z. Xue, R. Ma, N.M. Robertson, H. Guan, Object Guided External Memory Network for Video Object Detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, ICCV, 2019, pp. 6677\u20136686.","DOI":"10.1109\/ICCV.2019.00678"},{"key":"10.1016\/j.knosys.2026.116117_b38","doi-asserted-by":"crossref","unstructured":"G. Sun, Y. Hua, G. Hu, N. Robertson, MAMBA: Multi-level Aggregation via Memory Bank for Video Object Detection, in: Proceedings of the AAAI Conference on Artificial Intelligence, AAAI, 2021, pp. 2620\u20132627.","DOI":"10.1609\/aaai.v35i3.16365"},{"key":"10.1016\/j.knosys.2026.116117_b39","series-title":"Seq-nms for video object detection","author":"Han","year":"2016"},{"key":"10.1016\/j.knosys.2026.116117_b40","doi-asserted-by":"crossref","first-page":"3229","DOI":"10.1109\/TIP.2021.3058599","article-title":"Holistic LSTM for pedestrian trajectory prediction","volume":"30","author":"Quan","year":"2021","journal-title":"IEEE Trans. Image Process."},{"issue":"8","key":"10.1016\/j.knosys.2026.116117_b41","doi-asserted-by":"crossref","first-page":"10055","DOI":"10.1109\/TPAMI.2023.3262578","article-title":"Local-global context aware transformer for language-guided video segmentation","volume":"45","author":"Liang","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.knosys.2026.116117_b42","doi-asserted-by":"crossref","first-page":"3946","DOI":"10.1109\/TIP.2021.3066912","article-title":"Progressive transfer learning for face anti-spoofing","volume":"30","author":"Quan","year":"2021","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.knosys.2026.116117_b43","doi-asserted-by":"crossref","unstructured":"S. Liu, L. Qi, H. Qin, J. Shi, J. Jia, Path aggregation network for instance segmentation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2018, pp. 8759\u20138768.","DOI":"10.1109\/CVPR.2018.00913"},{"key":"10.1016\/j.knosys.2026.116117_b44","doi-asserted-by":"crossref","first-page":"3029","DOI":"10.1109\/TIP.2021.3056887","article-title":"Pyramidal multiple instance detection network with mask guided self-correction for weakly supervised object detection","volume":"30","author":"Xu","year":"2021","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.knosys.2026.116117_b45","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"14309","article-title":"H2FA R-CNN: Holistic and hierarchical feature alignment for cross-domain weakly supervised object detection","author":"Xu","year":"2022"},{"key":"10.1016\/j.knosys.2026.116117_b46","doi-asserted-by":"crossref","unstructured":"C. Yang, Y. Xu, J. Shi, B. Dai, B. Zhou, Temporal pyramid network for action recognition, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2020, pp. 591\u2013600.","DOI":"10.1109\/CVPR42600.2020.00067"},{"key":"10.1016\/j.knosys.2026.116117_b47","doi-asserted-by":"crossref","unstructured":"X. Li, J. Li, W. Du, H. Chen, H. Yang, Learning interval-aware embedding for macro-and micro-expression spotting, in: Proceedings of the Asian Conference on Computer Vision, ACCV, 2024, pp. 337\u2013353.","DOI":"10.1007\/978-981-96-0911-6_22"},{"key":"10.1016\/j.knosys.2026.116117_b48","unstructured":"A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A.N. Gomez, L. Kaiser, I. Polosukhin, Attention is All you Need, in: Advances in Neural Information Processing Systems (NeurIPS), 2017, pp. 5998\u20136008."},{"key":"10.1016\/j.knosys.2026.116117_b49","doi-asserted-by":"crossref","unstructured":"Y. Shi, N. Wang, X. Guo, YOLOV: Making Still Image Object Detectors Great at Video Object Detection, in: Proceedings of the AAAI Conference on Artificial Intelligence, AAAI, 2023, pp. 2254\u20132262.","DOI":"10.1609\/aaai.v37i2.25320"},{"key":"10.1016\/j.knosys.2026.116117_b50","doi-asserted-by":"crossref","unstructured":"S. An, S. Park, G. Kim, J. Baek, B. Lee, S. Kim, Context Enhanced Transformer for Single Image Object Detection in Video Data, in: Proceedings of the AAAI Conference on Artificial Intelligence, AAAI, 2024, pp. 682\u2013690.","DOI":"10.1609\/aaai.v38i2.27825"},{"key":"10.1016\/j.knosys.2026.116117_b51","doi-asserted-by":"crossref","first-page":"2109","DOI":"10.1109\/TMM.2023.3292615","article-title":"Class-aware dual-supervised aggregation network for video object detection","volume":"26","author":"Qi","year":"2024","journal-title":"IEEE Transactions on Multimedia"},{"key":"10.1016\/j.knosys.2026.116117_b52","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"6559","article-title":"Tgbformer: Transformer-GraphFormer blender network for video object detection","volume":"39","author":"Qi","year":"2025"},{"key":"10.1016\/j.knosys.2026.116117_b53","doi-asserted-by":"crossref","unstructured":"K.A. Hashmi, T.U. Sheikh, D. Stricker, M.Z. Afzal, Beyond Boxes: Mask-Guided Spatio-Temporal Feature Aggregation for Video Object Detection , in: 2025 IEEE\/CVF Winter Conference on Applications of Computer Vision, WACV, 2025, pp. 8122\u20138133.","DOI":"10.1109\/WACV61041.2025.00788"},{"issue":"3","key":"10.1016\/j.knosys.2026.116117_b54","doi-asserted-by":"crossref","DOI":"10.1007\/s11263-025-02700-3","article-title":"Practical video object detection via feature selection and aggregation","volume":"134","author":"Shi","year":"2026","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.knosys.2026.116117_b55","doi-asserted-by":"crossref","unstructured":"K. He, X. Zhang, S. Ren, J. Sun, Deep Residual Learning for Image Recognition, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2016, pp. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"10.1016\/j.knosys.2026.116117_b56","unstructured":"I. Loshchilov, F. Hutter, Decoupled Weight Decay Regularization, in: Proceedings of the International Conference on Learning Representations, ICLR, 2019."},{"key":"10.1016\/j.knosys.2026.116117_b57","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2025.131704","article-title":"IntSTR: An integrated spatio-temporal relation transformer for video object detection","volume":"658","author":"Zheng","year":"2025","journal-title":"Neurocomputing"},{"issue":"8","key":"10.1016\/j.knosys.2026.116117_b58","doi-asserted-by":"crossref","first-page":"2022","DOI":"10.1007\/s11263-022-01629-1","article-title":"Occluded video instance segmentation: A benchmark","volume":"130","author":"Qi","year":"2022","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.knosys.2026.116117_b59","series-title":"2020 International Joint Conference on Neural Networks","first-page":"1","article-title":"Eigen-cam: Class activation map using principal components","author":"Muhammad","year":"2020"}],"container-title":["Knowledge-Based Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126008439?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126008439?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T01:13:59Z","timestamp":1780017239000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0950705126008439"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":59,"alternative-id":["S0950705126008439"],"URL":"https:\/\/doi.org\/10.1016\/j.knosys.2026.116117","relation":{},"ISSN":["0950-7051"],"issn-type":[{"value":"0950-7051","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"StratiFormer: Stratified Temporal Transformer for video object detection","name":"articletitle","label":"Article Title"},{"value":"Knowledge-Based Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.knosys.2026.116117","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"116117"}}