{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,10]],"date-time":"2026-05-10T05:59:36Z","timestamp":1778392776464,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No.61902247"],"award-info":[{"award-number":["No.61902247"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shanghai Municipal Science and Technology Major Project","award":["2021SHZDZX0102"],"award-info":[{"award-number":["2021SHZDZX0102"]}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["No. 2019YFC1521104"],"award-info":[{"award-number":["No. 2019YFC1521104"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Zhejiang Lab","award":["No. 2020NB0AB01"],"award-info":[{"award-number":["No. 2020NB0AB01"]}]},{"name":"National Key R&D Program of China","award":["2018AAA0100704"],"award-info":[{"award-number":["2018AAA0100704"]}]},{"name":"Shanghai Science and Technology RD Program of China","award":["20511100300"],"award-info":[{"award-number":["20511100300"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475285","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T17:45:27Z","timestamp":1634579127000},"page":"1507-1516","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":96,"title":["End-to-End Video Object Detection with Spatial-Temporal Transformers"],"prefix":"10.1145","author":[{"given":"Lu","family":"He","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qianyu","family":"Zhou","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiangtai","family":"Li","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Li","family":"Niu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guangliang","family":"Cheng","sequence":"additional","affiliation":[{"name":"Sensetime Research, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiao","family":"Li","sequence":"additional","affiliation":[{"name":"Sensetime Research, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenxuan","family":"Liu","sequence":"additional","affiliation":[{"name":"University of California, Los Angeles, Los Angeles, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yunhai","family":"Tong","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lizhuang","family":"Ma","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Liqing","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Hatem Belhassen Heng Zhang Virginie Fresse and El-Bay Bourennane. 2019. Improving Video Object Detection by Seq-Bbox Matching. In VISIGRAPP (5: VISAPP). 226--233.  Hatem Belhassen Heng Zhang Virginie Fresse and El-Bay Bourennane. 2019. Improving Video Object Detection by Seq-Bbox Matching. In VISIGRAPP (5: VISAPP). 226--233.","DOI":"10.5220\/0007260002260233"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Gedas Bertasius Lorenzo Torresani and Jianbo Shi. 2018. Object Detection in Video with Spatiotemporal Sampling Networks. In ECCV. 342--357.  Gedas Bertasius Lorenzo Torresani and Jianbo Shi. 2018. Object Detection in Video with Spatiotemporal Sampling Networks. In ECCV. 342--357.","DOI":"10.1007\/978-3-030-01258-8_21"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00815"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Yihong Chen Yue Cao Han Hu and Liwei Wang. 2020. Memory enhanced global-local aggregation for video object detection. In CVPR. 10337--10346.  Yihong Chen Yue Cao Han Hu and Liwei Wang. 2020. Memory enhanced global-local aggregation for video object detection. In CVPR. 10337--10346.","DOI":"10.1109\/CVPR42600.2020.01035"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.5555\/3157096.3157139"},{"key":"e_1_3_2_1_7_1","volume-title":"R-fcn: Object detection via region-based fully convolutional networks. arXiv preprint arXiv:1605.06409","author":"Dai Jifeng","year":"2016","unstructured":"Jifeng Dai , Yi Li , Kaiming He , and Jian Sun . 2016 b. R-fcn: Object detection via region-based fully convolutional networks. arXiv preprint arXiv:1605.06409 (2016). Jifeng Dai, Yi Li, Kaiming He, and Jian Sun. 2016b. R-fcn: Object detection via region-based fully convolutional networks. arXiv preprint arXiv:1605.06409 (2016)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Jifeng Dai Haozhi Qi Yuwen Xiong Yi Li Guodong Zhang Han Hu and Yichen Wei. 2017. Deformable convolutional networks. In ICCV.  Jifeng Dai Haozhi Qi Yuwen Xiong Yi Li Guodong Zhang Han Hu and Yichen Wei. 2017. Deformable convolutional networks. In ICCV.","DOI":"10.1109\/ICCV.2017.89"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Hanming Deng Yang Hua Tao Song Zongpu Zhang Zhengui Xue Ruhui Ma Neil Robertson and Haibing Guan. 2019 a. Object Guided External Memory Network for Video Object Detection. In ICCV.  Hanming Deng Yang Hua Tao Song Zongpu Zhang Zhengui Xue Ruhui Ma Neil Robertson and Haibing Guan. 2019 a. Object Guided External Memory Network for Video Object Detection. In ICCV.","DOI":"10.1109\/ICCV.2019.00678"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Jiajun Deng Yingwei Pan Ting Yao Wengang Zhou Houqiang Li and Tao Mei. 2019 b. Relation Distillation Networks for Video Object Detection. In ICCV.  Jiajun Deng Yingwei Pan Ting Yao Wengang Zhou Houqiang Li and Tao Mei. 2019 b. Relation Distillation Networks for Video Object Detection. In ICCV.","DOI":"10.1109\/ICCV.2019.00712"},{"key":"e_1_3_2_1_12_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly etal 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020).  Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Christoph Feichtenhofer Axel Pinz and Andrew Zisserman. 2017. Detect to Track and Track to Detect. In ICCV. 3057--3065.  Christoph Feichtenhofer Axel Pinz and Andrew Zisserman. 2017. Detect to Track and Track to Detect. In ICCV. 3057--3065.","DOI":"10.1109\/ICCV.2017.330"},{"key":"e_1_3_2_1_14_1","unstructured":"J. Feng S. Li X. Li F. Wu Q. Tian M. H. Yang and H. Ling. 2020. TapLab: A Fast Framework for Semantic Video Segmentation Tapping into Compressed-Domain Knowledge. (2020).  J. Feng S. Li X. Li F. Wu Q. Tian M. H. Yang and H. Ling. 2020. TapLab: A Fast Framework for Semantic Video Segmentation Tapping into Compressed-Domain Knowledge. (2020)."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the thirteenth international conference on artificial intelligence and statistics. 249--256","author":"Glorot Xavier","year":"2010","unstructured":"Xavier Glorot and Yoshua Bengio . 2010 . Understanding the difficulty of training deep feedforward neural networks . In Proceedings of the thirteenth international conference on artificial intelligence and statistics. 249--256 . Xavier Glorot and Yoshua Bengio. 2010. Understanding the difficulty of training deep feedforward neural networks. In Proceedings of the thirteenth international conference on artificial intelligence and statistics. 249--256."},{"key":"e_1_3_2_1_16_1","unstructured":"Chaoxu Guo Bin Fan Jie Gu Qian Zhang Shiming Xiang Veronique Prinet and Chunhong Pan. 2019. Progressive sparse local attention for video object detection. In ICCV. 3909--3918.  Chaoxu Guo Bin Fan Jie Gu Qian Zhang Shiming Xiang Veronique Prinet and Chunhong Pan. 2019. Progressive sparse local attention for video object detection. In ICCV. 3909--3918."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413927"},{"key":"e_1_3_2_1_18_1","volume-title":"Mining Inter-Video Proposal Relations for Video Object Detection. In European Conference on Computer Vision. Springer, 431--446","author":"Han Mingfei","year":"2020","unstructured":"Mingfei Han , Yali Wang , Xiaojun Chang , and Yu Qiao . 2020 a . Mining Inter-Video Proposal Relations for Video Object Detection. In European Conference on Computer Vision. Springer, 431--446 . Mingfei Han, Yali Wang, Xiaojun Chang, and Yu Qiao. 2020 a. Mining Inter-Video Proposal Relations for Video Object Detection. In European Conference on Computer Vision. Springer, 431--446."},{"key":"e_1_3_2_1_19_1","volume-title":"Prajit Ramachandran, Mohammad Babaeizadeh, Honghui Shi, Jianan Li, Shuicheng Yan, and Thomas S Huang.","author":"Han Wei","year":"2016","unstructured":"Wei Han , Pooya Khorrami , Tom Le Paine , Prajit Ramachandran, Mohammad Babaeizadeh, Honghui Shi, Jianan Li, Shuicheng Yan, and Thomas S Huang. 2016 . Seq-nms for video object detection. arXiv preprint arXiv:1602.08465 (2016). Wei Han, Pooya Khorrami, Tom Le Paine, Prajit Ramachandran, Mohammad Babaeizadeh, Honghui Shi, Jianan Li, Shuicheng Yan, and Thomas S Huang. 2016. Seq-nms for video object detection. arXiv preprint arXiv:1602.08465 (2016)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6727"},{"key":"e_1_3_2_1_21_1","volume-title":"Piotr Doll\u00e1 r, and Ross B. Girshick","author":"He Kaiming","year":"2017","unstructured":"Kaiming He , Georgia Gkioxari , Piotr Doll\u00e1 r, and Ross B. Girshick . 2017 . Mask R-CNN. In ICCV. 2980--2988. Kaiming He, Georgia Gkioxari, Piotr Doll\u00e1 r, and Ross B. Girshick. 2017. Mask R-CNN. In ICCV. 2980--2988."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_24_1","volume-title":"Batch normalization: Accelerating deep network training by reducing internal covariate shift. arXiv preprint arXiv:1502.03167","author":"Ioffe Sergey","year":"2015","unstructured":"Sergey Ioffe and Christian Szegedy . 2015. Batch normalization: Accelerating deep network training by reducing internal covariate shift. arXiv preprint arXiv:1502.03167 ( 2015 ). Sergey Ioffe and Christian Szegedy. 2015. Batch normalization: Accelerating deep network training by reducing internal covariate shift. arXiv preprint arXiv:1502.03167 (2015)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018529"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58517-4_2"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2017.2736553"},{"key":"e_1_3_2_1_28_1","volume-title":"The Hungarian method for the assignment problem. Naval research logistics quarterly","author":"Kuhn Harold W","year":"1955","unstructured":"Harold W Kuhn . 1955. The Hungarian method for the assignment problem. Naval research logistics quarterly , Vol. 2 , 1--2 ( 1955 ), 83--97. Harold W Kuhn. 1955. The Hungarian method for the assignment problem. Naval research logistics quarterly, Vol. 2, 1--2 (1955), 83--97."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413583"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.106"},{"key":"e_1_3_2_1_31_1","unstructured":"Tsung-Yi Lin Priya Goyal Ross Girshick Kaiming He and Piotr Dollar. 2017b. Focal Loss for Dense Object Detection. In ICCV.  Tsung-Yi Lin Priya Goyal Ross Girshick Kaiming He and Piotr Dollar. 2017b. Focal Loss for Dense Object Detection. In ICCV."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"e_1_3_2_1_34_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter . 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 ( 2017 ). Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_35_1","volume-title":"Object Detection in Video with Spatial-temporal Context Aggregation. arXiv preprint arXiv:1907.04988","author":"Luo Hao","year":"2019","unstructured":"Hao Luo , Lichao Huang , Han Shen , Yuan Li , Chang Huang , and Xinggang Wang . 2019. Object Detection in Video with Spatial-temporal Context Aggregation. arXiv preprint arXiv:1907.04988 ( 2019 ). Hao Luo, Lichao Huang, Han Shen, Yuan Li, Chang Huang, and Xinggang Wang. 2019. Object Detection in Video with Spatial-temporal Context Aggregation. arXiv preprint arXiv:1907.04988 (2019)."},{"key":"e_1_3_2_1_36_1","volume-title":"TrackFormer: Multi-Object Tracking with Transformers. arXiv preprint arXiv:2101.02702","author":"Meinhardt Tim","year":"2021","unstructured":"Tim Meinhardt , Alexander Kirillov , Laura Leal-Taixe , and Christoph Feichtenhofer . 2021. TrackFormer: Multi-Object Tracking with Transformers. arXiv preprint arXiv:2101.02702 ( 2021 ). Tim Meinhardt, Alexander Kirillov, Laura Leal-Taixe, and Christoph Feichtenhofer. 2021. TrackFormer: Multi-Object Tracking with Transformers. arXiv preprint arXiv:2101.02702 (2021)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.5555\/2969239.2969250"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Hamid Rezatofighi Nathan Tsoi JunYoung Gwak Amir Sadeghian Ian Reid and Silvio Savarese. 2019. Generalized Intersection Over Union: A Metric and a Loss for Bounding Box Regression. In CVPR.  Hamid Rezatofighi Nathan Tsoi JunYoung Gwak Amir Sadeghian Ian Reid and Silvio Savarese. 2019. Generalized Intersection Over Union: A Metric and a Loss for Bounding Box Regression. In CVPR.","DOI":"10.1109\/CVPR.2019.00075"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_1_41_1","volume-title":"Robust and efficient post-processing for video object detection. arXiv preprint arXiv:2009.11050","author":"Sabater Alberto","year":"2020","unstructured":"Alberto Sabater , Luis Montesano , and Ana C Murillo . 2020. Robust and efficient post-processing for video object detection. arXiv preprint arXiv:2009.11050 ( 2020 ). Alberto Sabater, Luis Montesano, and Ana C Murillo. 2020. Robust and efficient post-processing for video object detection. arXiv preprint arXiv:2009.11050 (2020)."},{"key":"e_1_3_2_1_42_1","volume-title":"Berg","author":"Shvets Mykhailo","year":"2019","unstructured":"Mykhailo Shvets , Wei Liu , and Alexander C . Berg . 2019 . Leveraging Long-Range Temporal Relationships Between Proposals for Video Object Detection. In ICCV. 9755--9763. Mykhailo Shvets, Wei Liu, and Alexander C. Berg. 2019. Leveraging Long-Range Temporal Relationships Between Proposals for Video Object Detection. In ICCV. 9755--9763."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.255"},{"key":"e_1_3_2_1_44_1","volume-title":"MAMBA: Multi-level Aggregation via Memory Bank for Video Object Detection. In AAAI.","author":"Sun Guanxiong","year":"2021","unstructured":"Guanxiong Sun , Yang Hua , Guosheng Hu , and Neil Robertson . 2021 . MAMBA: Multi-level Aggregation via Memory Bank for Video Object Detection. In AAAI. Guanxiong Sun, Yang Hua, Guosheng Hu, and Neil Robertson. 2021. MAMBA: Multi-level Aggregation via Memory Bank for Video Object Detection. In AAAI."},{"key":"e_1_3_2_1_45_1","volume-title":"2020 a. TransTrack: Multiple-Object Tracking with Transformer. arXiv preprint arXiv:2012.15460","author":"Sun Peize","year":"2020","unstructured":"Peize Sun , Yi Jiang , Rufeng Zhang , Enze Xie , Jinkun Cao , Xinting Hu , Tao Kong , Zehuan Yuan , Changhu Wang , and Ping Luo . 2020 a. TransTrack: Multiple-Object Tracking with Transformer. arXiv preprint arXiv:2012.15460 ( 2020 ). Peize Sun, Yi Jiang, Rufeng Zhang, Enze Xie, Jinkun Cao, Xinting Hu, Tao Kong, Zehuan Yuan, Changhu Wang, and Ping Luo. 2020 a. TransTrack: Multiple-Object Tracking with Transformer. arXiv preprint arXiv:2012.15460 (2020)."},{"key":"e_1_3_2_1_46_1","volume-title":"2020 b. Sparse r-cnn: End-to-end object detection with learnable proposals. arXiv preprint arXiv:2011.12450","author":"Sun Peize","year":"2020","unstructured":"Peize Sun , Rufeng Zhang , Yi Jiang , Tao Kong , Chenfeng Xu , Wei Zhan , Masayoshi Tomizuka , Lei Li , Zehuan Yuan , Changhu Wang , 2020 b. Sparse r-cnn: End-to-end object detection with learnable proposals. arXiv preprint arXiv:2011.12450 ( 2020 ). Peize Sun, Rufeng Zhang, Yi Jiang, Tao Kong, Chenfeng Xu, Wei Zhan, Masayoshi Tomizuka, Lei Li, Zehuan Yuan, Changhu Wang, et al. 2020 b. Sparse r-cnn: End-to-end object detection with learnable proposals. arXiv preprint arXiv:2011.12450 (2020)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00972"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_2_1_49_1","volume-title":"Attention is all you need. arXiv preprint arXiv:1706.03762","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani , Noam Shazeer , Niki Parmar , Jakob Uszkoreit , Llion Jones , Aidan N Gomez , Lukasz Kaiser , and Illia Polosukhin . 2017b. Attention is all you need. arXiv preprint arXiv:1706.03762 ( 2017 ). Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017b. Attention is all you need. arXiv preprint arXiv:1706.03762 (2017)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Shiyao Wang Yucong Zhou Junjie Yan and Zhidong Deng. 2018b. Fully Motion-Aware Network for Video Object Detection. In ECCV.  Shiyao Wang Yucong Zhou Junjie Yan and Zhidong Deng. 2018b. Fully Motion-Aware Network for Video Object Detection. In ECCV.","DOI":"10.1007\/978-3-030-01261-8_33"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"e_1_3_2_1_52_1","volume-title":"End-to-End Video Instance Segmentation with Transformers. arXiv preprint arXiv:2011.14503","author":"Wang Yuqing","year":"2020","unstructured":"Yuqing Wang , Zhaoliang Xu , Xinlong Wang , Chunhua Shen , Baoshan Cheng , Hao Shen , and Huaxia Xia . 2020. End-to-End Video Instance Segmentation with Transformers. arXiv preprint arXiv:2011.14503 ( 2020 ). Yuqing Wang, Zhaoliang Xu, Xinlong Wang, Chunhua Shen, Baoshan Cheng, Hao Shen, and Huaxia Xia. 2020. End-to-End Video Instance Segmentation with Transformers. arXiv preprint arXiv:2011.14503 (2020)."},{"key":"e_1_3_2_1_53_1","unstructured":"Haiping Wu Yuntao Chen Naiyan Wang and Zhaoxiang Zhang. 2019. Sequence Level Semantics Aggregation for Video Object Detection. In ICCV.  Haiping Wu Yuntao Chen Naiyan Wang and Zhaoxiang Zhang. 2019. Sequence Level Semantics Aggregation for Video Object Detection. In ICCV."},{"key":"e_1_3_2_1_54_1","volume-title":"CenterNet Heatmap Propagation for Real-Time Video Object Detection. In European Conference on Computer Vision. Springer, 220--234","author":"Xu Zhujun","year":"2020","unstructured":"Zhujun Xu , Emir Hrustic , and Damien Vivet . 2020 . CenterNet Heatmap Propagation for Real-Time Video Object Detection. In European Conference on Computer Vision. Springer, 220--234 . Zhujun Xu, Emir Hrustic, and Damien Vivet. 2020. CenterNet Heatmap Propagation for Real-Time Video Object Detection. In European Conference on Computer Vision. Springer, 220--234."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"crossref","unstructured":"Linjie Yang Yuchen Fan and Ning Xu. 2019. Video instance segmentation. In ICCV.  Linjie Yang Yuchen Fan and Ning Xu. 2019. Video instance segmentation. In ICCV.","DOI":"10.1109\/ICCV.2019.00529"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58568-6_10"},{"key":"e_1_3_2_1_57_1","volume-title":"Objects as points. arXiv preprint arXiv:1904.07850","author":"Zhou Xingyi","year":"2019","unstructured":"Xingyi Zhou , Dequan Wang , and Philipp Kr\"ahenb \u00fchl . 2019. Objects as points. arXiv preprint arXiv:1904.07850 ( 2019 ). Xingyi Zhou, Dequan Wang, and Philipp Kr\"ahenb\u00fchl. 2019. Objects as points. arXiv preprint arXiv:1904.07850 (2019)."},{"key":"e_1_3_2_1_58_1","unstructured":"Xizhou Zhu Jifeng Dai Lu Yuan and Yichen Wei. 2018. Towards High Performance Video Object Detection. In CVPR. 7210--7218.  Xizhou Zhu Jifeng Dai Lu Yuan and Yichen Wei. 2018. Towards High Performance Video Object Detection. In CVPR. 7210--7218."},{"key":"e_1_3_2_1_59_1","volume-title":"Deformable DETR: Deformable Transformers for End-to-End Object Detection. arXiv preprint arXiv:2010.04159","author":"Zhu Xizhou","year":"2020","unstructured":"Xizhou Zhu , Weijie Su , Lewei Lu , Bin Li , Xiaogang Wang , and Jifeng Dai . 2020. Deformable DETR: Deformable Transformers for End-to-End Object Detection. arXiv preprint arXiv:2010.04159 ( 2020 ). Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, and Jifeng Dai. 2020. Deformable DETR: Deformable Transformers for End-to-End Object Detection. arXiv preprint arXiv:2010.04159 (2020)."},{"key":"e_1_3_2_1_60_1","unstructured":"Xizhou Zhu Yujie Wang Jifeng Dai Lu Yuan and Yichen Wei. 2017a. Flow-Guided Feature Aggregation for Video Object Detection. In ICCV.  Xizhou Zhu Yujie Wang Jifeng Dai Lu Yuan and Yichen Wei. 2017a. Flow-Guided Feature Aggregation for Video Object Detection. In ICCV."},{"key":"e_1_3_2_1_61_1","unstructured":"Xizhou Zhu Yuwen Xiong Jifeng Dai Lu Yuan and Yichen Wei. 2017b. Deep Feature Flow for Video Recognition. In CVPR.  Xizhou Zhu Yuwen Xiong Jifeng Dai Lu Yuan and Yichen Wei. 2017b. Deep Feature Flow for Video Recognition. In CVPR."}],"event":{"name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China","acronym":"MM '21","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475285","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475285","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:48:17Z","timestamp":1750193297000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475285"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":61,"alternative-id":["10.1145\/3474085.3475285","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475285","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}