{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T16:19:08Z","timestamp":1779380348235,"version":"3.53.1"},"publisher-location":"Cham","reference-count":51,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030012366","type":"print"},{"value":"9783030012373","type":"electronic"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-030-01237-3_30","type":"book-chapter","created":{"date-parts":[[2018,10,6]],"date-time":"2018-10-06T18:42:18Z","timestamp":1538851338000},"page":"494-510","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":139,"title":["Video Object Detection with an Aligned Spatial-Temporal Memory"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9839-1139","authenticated-orcid":false,"given":"Fanyi","family":"Xiao","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9863-1270","authenticated-orcid":false,"given":"Yong Jae","family":"Lee","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2018,10,7]]},"reference":[{"key":"30_CR1","unstructured":"http:\/\/image-net.org\/challenges\/LSVRC\/2015\/results#vid"},{"key":"30_CR2","unstructured":"Ba, J., Mnih, V., Kavukcuoglu, K.: Multiple object recognition with visual attention. arXiv preprint arXiv:1412.7755 (2014)"},{"key":"30_CR3","unstructured":"Ba, J.L., Kiros, J.R., Hinton, G.E.: Layer normalization. arXiv preprint arXiv:1607.06450 (2016)"},{"key":"30_CR4","unstructured":"Ballas, N., Yao, L., Pal, C., Courville, A.: Delving deeper into convolutional networks for learning video representations. In: ICLR (2016)"},{"issue":"3","key":"30_CR5","doi-asserted-by":"publisher","first-page":"500","DOI":"10.1109\/TPAMI.2010.143","volume":"33","author":"T Brox","year":"2011","unstructured":"Brox, T., Malik, J.: Large displacement optical flow: descriptor matching in variational motion estimation. PAMI 33(3), 500\u2013513 (2011)","journal-title":"PAMI"},{"key":"30_CR6","doi-asserted-by":"crossref","unstructured":"Carreira, J., Agrawal, P., Fragkiadaki, K., Malik, J.: Human pose estimation with iterative error feedback. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.512"},{"key":"30_CR7","doi-asserted-by":"crossref","unstructured":"Chen, X., Gupta, A.: Spatial memory for context reasoning in object detection. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.440"},{"issue":"4","key":"30_CR8","doi-asserted-by":"publisher","first-page":"271","DOI":"10.1016\/S0968-090X(98)00019-9","volume":"6C","author":"B Coifman","year":"1998","unstructured":"Coifman, B., Beymer, D., Mclauchlan, P., Malik, J.: A realtime computer vision system for vehicle tracking and traffic surveillance. Transp. Res. C 6C(4), 271\u2013288 (1998)","journal-title":"Transp. Res. C"},{"key":"30_CR9","unstructured":"Dai, J., Li, Y., He, K., Sun, J.: R-FCN: object detection via region-based fully convolutional networks. In: NIPS (2016)"},{"key":"30_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"428","DOI":"10.1007\/11744047_33","volume-title":"Computer Vision \u2013 ECCV 2006","author":"N Dalal","year":"2006","unstructured":"Dalal, N., Triggs, B., Schmid, C.: Human detection using oriented histograms of flow and appearance. In: Leonardis, A., Bischof, H., Pinz, A. (eds.) ECCV 2006. LNCS, vol. 3952, pp. 428\u2013441. Springer, Heidelberg (2006). https:\/\/doi.org\/10.1007\/11744047_33"},{"key":"30_CR11","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"30_CR12","doi-asserted-by":"crossref","unstructured":"Donahue, J., et al.: Long-term recurrent convolutional networks for visual recognition and description. In: CVPR (2015)","DOI":"10.21236\/ADA623249"},{"key":"30_CR13","doi-asserted-by":"crossref","unstructured":"Dosovitskiy, A., et al.: FlowNet: learning optical flow with convolutional networks. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.316"},{"key":"30_CR14","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Pinz, A., Zisserman, A.: Detect to track and track to detect. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.330"},{"key":"30_CR15","doi-asserted-by":"crossref","unstructured":"Fragkiadaki, K., Levine, S., Felsen, P., Malik, J.: Recurrent network models for human dynamics. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.494"},{"key":"30_CR16","doi-asserted-by":"crossref","unstructured":"Girshick, R., Donahue, J., Darrell, T., Malik, J.: Rich feature hierarchies for accurate object detection and semantic segmentation. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.81"},{"key":"30_CR17","doi-asserted-by":"crossref","unstructured":"Girshick, R.: Fast R-CNN. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.169"},{"key":"30_CR18","unstructured":"Han, P., Yuan, W., Lu, Z., Wen, J.R.: Video detection by learning with deep representation and spatio-temporal context (2015)"},{"key":"30_CR19","unstructured":"Han, W., et al.: Seq-NMS for video object detection. arXiv preprint arXiv:1602.08465 (2016)"},{"key":"30_CR20","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"30_CR21","doi-asserted-by":"crossref","unstructured":"Huang, X., Belongie, S.J.: Arbitrary style transfer in real-time with adaptive instance normalization. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.167"},{"key":"30_CR22","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: Accelerating deep network training by reducing internal covariate shift. arXiv preprint arXiv:1502.03167 (2015)"},{"key":"30_CR23","doi-asserted-by":"crossref","unstructured":"Jones, M., Snow, D.: Pedestrian detection using boosted features over many frames. In: ICPR (2008)","DOI":"10.1109\/ICPR.2008.4761703"},{"key":"30_CR24","doi-asserted-by":"crossref","unstructured":"Kang, K., et al.: Object detection in videos with tubelet proposal networks. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.101"},{"key":"30_CR25","unstructured":"Kang, K., et al.: T-CNN: tubelets with convolutional neural networks for object detection from videos. TCSVT (2017)"},{"key":"30_CR26","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Toderici, G., Shetty, S., Leung, T., Sukthankar, R., Fei-Fei, L.: Large-scale video classification with convolutional neural networks. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.223"},{"key":"30_CR27","unstructured":"Kiros, R., Salakhutdinov, R., Zemel, R.S.: Unifying visual-semantic embeddings with multimodal neural language models. arXiv preprint arXiv:1411.2539 (2014)"},{"key":"30_CR28","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.: ImageNet classification with deep convolutional neural networks. In: NIPS (2012)"},{"key":"30_CR29","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"68","DOI":"10.1007\/978-3-319-48881-3_6","volume-title":"Computer Vision \u2013 ECCV 2016 Workshops","author":"B Lee","year":"2016","unstructured":"Lee, B., Erdenee, E., Jin, S., Nam, M.Y., Jung, Y.G., Rhee, P.K.: Multi-class multi-object tracking using changing point detection. In: Hua, G., J\u00e9gou, H. (eds.) ECCV 2016. LNCS, vol. 9914, pp. 68\u201383. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-48881-3_6"},{"key":"30_CR30","doi-asserted-by":"crossref","unstructured":"Li, Y., Zhu, J., Hoi, S.C.: Reliable patch trackers: robust visual tracking by exploiting reliable patches. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298632"},{"key":"30_CR31","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1007\/978-3-319-46448-0_2","volume-title":"Computer Vision \u2013 ECCV 2016","author":"W Liu","year":"2016","unstructured":"Liu, W., et al.: SSD: single shot multibox detector. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 21\u201337. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_2"},{"key":"30_CR32","unstructured":"Mnih, V., Heess, N., Graves, A., et al.: Recurrent models of visual attention. In: NIPS (2014)"},{"key":"30_CR33","doi-asserted-by":"crossref","unstructured":"Nam, H., Han, B.: Learning multi-domain convolutional neural networks for visual tracking. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.465"},{"key":"30_CR34","doi-asserted-by":"crossref","unstructured":"Ouyang, W., et al.: DeepID-Net: multi-stage and deformable deep convolutional neural networks for object detection. arXiv preprint arXiv:1409.3505 (2014)","DOI":"10.1109\/CVPR.2015.7298854"},{"key":"30_CR35","doi-asserted-by":"crossref","unstructured":"Park, D., Zitnick, C.L., Ramanan, D., Dollar, P.: Exploring weak stabilization for motion feature extraction. In: CVPR (2013)","DOI":"10.1109\/CVPR.2013.371"},{"key":"30_CR36","unstructured":"Pinheiro, P.O., Collobert, R., Dollar, P.: Learning to segment object candidates. In: NIPS (2015)"},{"key":"30_CR37","doi-asserted-by":"crossref","unstructured":"Redmon, J., Divvala, S., Girshick, R., Farhadi, A.: You only look once: unified, real-time object detection. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.91"},{"key":"30_CR38","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: NIPS (2015)"},{"key":"30_CR39","unstructured":"Sermanet, P., Eigen, D., Zhang, X., Mathieu, M., Fergus, R., LeCun, Y.: OverFeat: integrated recognition, localization and detection using convolutional networks. In: ICLR (2014)"},{"key":"30_CR40","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"330","DOI":"10.1007\/978-3-319-46448-0_20","volume-title":"Computer Vision \u2013 ECCV 2016","author":"A Shrivastava","year":"2016","unstructured":"Shrivastava, A., Gupta, A.: Contextual priming and feedback for faster R-CNN. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 330\u2013348. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_20"},{"key":"30_CR41","doi-asserted-by":"crossref","unstructured":"Tokmakov, P., Alahari, K., Schmid, C.: Learning video object segmentation with visual memory. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.480"},{"key":"30_CR42","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3D convolutional networks. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"30_CR43","doi-asserted-by":"crossref","unstructured":"Tripathi, S., Lipton, Z.C., Belongie, S., Nguyen, T.: Context matters: Refining object detection in video with recurrent neural networks. arXiv preprint arXiv:1607.04648 (2016)","DOI":"10.5244\/C.30.44"},{"key":"30_CR44","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"issue":"2","key":"30_CR45","doi-asserted-by":"publisher","first-page":"153","DOI":"10.1007\/s11263-005-6644-8","volume":"63","author":"P Viola","year":"2005","unstructured":"Viola, P., Jones, M., Snow, D.: Detecting pedestrians using patterns of motion and appearance. IJCV 63(2), 153\u2013161 (2005)","journal-title":"IJCV"},{"key":"30_CR46","doi-asserted-by":"publisher","first-page":"780","DOI":"10.1109\/34.598236","volume":"19","author":"C Wren","year":"1997","unstructured":"Wren, C., Azarbayejani, A., Darrell, T., Pentland, A.: Pfinder: real-time tracking of the human body. PAMI 19, 780\u2013785 (1997)","journal-title":"PAMI"},{"key":"30_CR47","unstructured":"Xu, K., et al.: Show, attend and tell: Neural image caption generation with visual attention. arXiv preprint arXiv:1502.03044 (2015)"},{"key":"30_CR48","doi-asserted-by":"crossref","unstructured":"Yang, B., Yan, J., Lei, Z., Li, S.Z.: Craft objects from images. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.650"},{"key":"30_CR49","doi-asserted-by":"crossref","unstructured":"Zheng, S., et al.: Conditional random fields as recurrent neural networks. In: CVPR (2015)","DOI":"10.1109\/ICCV.2015.179"},{"key":"30_CR50","doi-asserted-by":"crossref","unstructured":"Zhu, X., Dai, J., Yuan, L., Wei, Y.: Towards high performance video object detection. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00753"},{"key":"30_CR51","doi-asserted-by":"crossref","unstructured":"Zhu, X., Wang, Y., Dai, J., Yuan, L., Wei, Y.: Flow-guided feature aggregation for video object detection. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.52"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-01237-3_30","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,6]],"date-time":"2022-10-06T00:14:21Z","timestamp":1665015261000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-01237-3_30"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783030012366","9783030012373"],"references-count":51,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-01237-3_30","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]},"assertion":[{"value":"7 October 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}