{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T23:57:12Z","timestamp":1777420632850,"version":"3.51.4"},"publisher-location":"Cham","reference-count":37,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030110178","type":"print"},{"value":"9783030110185","type":"electronic"}],"license":[{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2019]]},"DOI":"10.1007\/978-3-030-11018-5_19","type":"book-chapter","created":{"date-parts":[[2019,1,24]],"date-time":"2019-01-24T05:50:50Z","timestamp":1548309050000},"page":"206-218","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":59,"title":["NeXtVLAD: An Efficient Neural Network to Aggregate Frame-Level Features for Large-Scale Video Classification"],"prefix":"10.1007","author":[{"given":"Rongcheng","family":"Lin","sequence":"first","affiliation":[]},{"given":"Jing","family":"Xiao","sequence":"additional","affiliation":[]},{"given":"Jianping","family":"Fan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,1,23]]},"reference":[{"key":"19_CR1","unstructured":"Abu-El-Haija, S., et al.: Youtube-8m: a large-scale video classification benchmark. arXiv:1609.08675 (2016)"},{"key":"19_CR2","doi-asserted-by":"crossref","unstructured":"Arandjelovi\u0107, R., Gronat, P., Torii, A., Pajdla, T., Sivic, J.: NetVLAD: CNN architecture for weakly supervised place recognition. In: IEEE Conference on Computer Vision and Pattern Recognition (2016)","DOI":"10.1109\/CVPR.2016.572"},{"key":"19_CR3","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9, 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"19_CR4","doi-asserted-by":"crossref","unstructured":"Cho, K., et al.: Learning phrase representations using RNN encoder-decoder for statistical machine translation. In: Moschitti, A., Pang, B., Daelemans, W. (eds.) EMNLP, ACL, pp. 1724\u20131734 (2014)","DOI":"10.3115\/v1\/D14-1179"},{"key":"19_CR5","unstructured":"Miech, A., Laptev, I., Sivic, J.: Learnable pooling with context gating for video classification. CoRR (2017)"},{"key":"19_CR6","doi-asserted-by":"crossref","unstructured":"Xie, S., Girshick, R.B., Doll\u00e1r, P., Tu, Z., He, K.: Aggregated residual transformations for deep neural networks. CoRR (2016)","DOI":"10.1109\/CVPR.2017.634"},{"key":"19_CR7","doi-asserted-by":"crossref","unstructured":"Sivic, J., Zisserman, A.: Video Google: a text retrieval approach to object matching in videos. In: IEEE International Conference on Computer Vision, vol. 2, pp. 1470\u20131477 (2003)","DOI":"10.1109\/ICCV.2003.1238663"},{"key":"19_CR8","doi-asserted-by":"crossref","unstructured":"Perronnin, F., Dance, C.R.: Fisher kernels on visual vocabularies for image categorization. In: CVPR IEEE Computer Society (2007)","DOI":"10.1109\/CVPR.2007.383266"},{"key":"19_CR9","doi-asserted-by":"crossref","unstructured":"Jegou, H., Douze, M., Schmid, C., P\u00e9rez, P.: Aggregating local descriptors into a compact image representation. In: CVPR IEEE Computer Society, pp. 3304\u20133311 (2010)","DOI":"10.1109\/CVPR.2010.5540039"},{"key":"19_CR10","doi-asserted-by":"crossref","unstructured":"Laptev, I., Marsza\u0142ek, M., Schmid, C., Rozenfeld, B.: Learning realistic human actions from movies. In: CVPR (2008)","DOI":"10.1109\/CVPR.2008.4587756"},{"key":"19_CR11","doi-asserted-by":"crossref","unstructured":"Schuldt, C., Laptev, I., Caputo, B.: Recognizing human actions: a local SVM approach. In: 17th International Conference on Proceedings of the Pattern Recognition, (ICPR 2004) Volume 3 - Volume 03. ICPR 2004, IEEE Computer Society, pp. 32\u201336 (2004)","DOI":"10.1109\/ICPR.2004.1334462"},{"key":"19_CR12","doi-asserted-by":"crossref","unstructured":"Girdhar, R., Ramanan, D., Gupta, A., Sivic, J., Russell, B.: ActionVLAD: learning spatio-temporal aggregation for action classification. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.337"},{"key":"19_CR13","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Toderici, G., Shetty, S., Leung, T., Sukthankar, R., Fei-Fei, L.: Large-scale video classification with convolutional neural networks. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.223"},{"key":"19_CR14","doi-asserted-by":"crossref","unstructured":"Caba Heilbron, F., Escorcia, V., Ghanem, B., Carlos Niebles, J.: ActivityNet: a large-scale video benchmark for human activity understanding. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR), June 2015","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"19_CR15","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"29","DOI":"10.1007\/978-3-642-25446-8_4","volume-title":"Human Behavior Understanding","author":"M Baccouche","year":"2011","unstructured":"Baccouche, M., Mamalet, F., Wolf, C., Garcia, C., Baskurt, A.: Sequential deep learning for human action recognition. In: Salah, A.A., Lepri, B. (eds.) HBU 2011. LNCS, vol. 7065, pp. 29\u201339. Springer, Heidelberg (2011). https:\/\/doi.org\/10.1007\/978-3-642-25446-8_4"},{"key":"19_CR16","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. In: Proceedings of the 27th International Conference on Neural Information Processing Systems - Volume 1. NIPS 2014, pp. 568\u2013576. MIT Press (2014)"},{"key":"19_CR17","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2013","unstructured":"Ji, S., Xu, W., Yang, M., Yu, K.: 3D convolutional neural networks for human action recognition. IEEE Trans. Pattern Anal. Mach. Intell. 35, 221\u2013231 (2013)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"19_CR18","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3D convolutional networks. In: Proceedings of the 2015 IEEE International Conference on Computer Vision (ICCV). ICCV 2015, IEEE Computer Society, pp. 4489\u20134497 (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"19_CR19","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Pinz, A., Zisserman, A.: Convolutional two-stream network fusion for video action recognition. CoRR (2016)","DOI":"10.1109\/CVPR.2016.213"},{"key":"19_CR20","unstructured":"Wu, Z., Jiang, Y., Wang, X., Ye, H., Xue, X., Wang, J.: Fusing multi-stream deep networks for video classification. CoRR (2015)"},{"key":"19_CR21","unstructured":"Ng, J.Y.H., Hausknecht, M., Vijayanarasimhan, S., Vinyals, O., Monga, R., Toderici, G.: Beyond short snippets: deep networks for video classification. In: Computer Vision and Pattern Recognition (2015)"},{"key":"19_CR22","unstructured":"Ballas, N., Yao, L., Pal, C., Courville, A.C.: Delving deeper into convolutional networks for learning video representations. CoRR (2015)"},{"key":"19_CR23","doi-asserted-by":"crossref","unstructured":"Fernando, B., Gavves, E., Oramas, J.M., Ghodrati, A., Tuytelaars, T.: Modeling video evolution for action recognition. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR), June 2015","DOI":"10.1109\/CVPR.2015.7299176"},{"key":"19_CR24","doi-asserted-by":"crossref","unstructured":"Wang, X., Farhadi, A., Gupta, A.: Actions \u00a0 transformations. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.291"},{"key":"19_CR25","unstructured":"Bilen, H., Fernando, B., Gavves, E., Vedaldi, A.: Action recognition with dynamic image networks. CoRR (2016)"},{"key":"19_CR26","doi-asserted-by":"crossref","unstructured":"Wang, L., Li, W., Li, W., Gool, L.V.: Appearance-and-relation networks for video classification. Technical report, arXiv (2017)","DOI":"10.1109\/CVPR.2018.00155"},{"key":"19_CR27","doi-asserted-by":"crossref","unstructured":"Arandjelovic, R., Zisserman, A.: All about VLAD. In: Proceedings of the 2013 IEEE Conference on Computer Vision and Pattern Recognition, IEEE Computer Society, pp. 1578\u20131585 (2013)","DOI":"10.1109\/CVPR.2013.207"},{"key":"19_CR28","doi-asserted-by":"crossref","unstructured":"Hu, J., Shen, L., Sun, G.: Squeeze-and-excitation networks. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00745"},{"key":"19_CR29","unstructured":"Hinton, G., Vinyals, O., Dean, J.: Distilling the knowledge in a neural network. In: NIPS Deep Learning and Representation Learning Workshop (2015)"},{"key":"19_CR30","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Xiang, T., Hospedales, T.M., Lu, H.: Deep mutual learning. CoRR (2017)","DOI":"10.1109\/CVPR.2018.00454"},{"key":"19_CR31","doi-asserted-by":"crossref","unstructured":"Li, Z., Hoiem, D.: Learning without forgetting. CoRR (2016)","DOI":"10.1007\/978-3-319-46493-0_37"},{"key":"19_CR32","unstructured":"Lan, X., Zhu, X., Gong, S.: Knowledge distillation by on-the-fly native ensemble. arXiv:1806.04606 (2018)"},{"key":"19_CR33","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: accelerating deep network training by reducing internal covariate shift. In: Proceedings of the 32nd International Conference on International Conference on Machine Learning - Volume 37. ICML 2015, pp. 448\u2013456. JMLR.org (2015)"},{"key":"19_CR34","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li-jia, L., Li, K., Fei-fei, L.: ImageNet: a large-scale hierarchical image database. In: CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"19_CR35","doi-asserted-by":"crossref","unstructured":"Hershey, S., et al.: CNN architectures for large-scale audio classification. CoRR (2016)","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"19_CR36","unstructured":"Abadi, M., et al.: TensorFlow: a system for large-scale machine learning. In: Proceedings of the 12th USENIX Conference on Operating Systems Design and Implementation. OSDI 2016, USENIX Association, pp. 265\u2013283 (2016)"},{"key":"19_CR37","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. CoRR (2014)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-11018-5_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,22]],"date-time":"2023-01-22T01:20:24Z","timestamp":1674350424000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-11018-5_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019]]},"ISBN":["9783030110178","9783030110185"],"references-count":37,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-11018-5_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019]]},"assertion":[{"value":"23 January 2019","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}