{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,4]],"date-time":"2025-07-04T05:36:23Z","timestamp":1751607383874,"version":"3.40.3"},"publisher-location":"Cham","reference-count":29,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030110178"},{"type":"electronic","value":"9783030110185"}],"license":[{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2019]]},"DOI":"10.1007\/978-3-030-11018-5_26","type":"book-chapter","created":{"date-parts":[[2019,1,24]],"date-time":"2019-01-24T05:50:50Z","timestamp":1548309050000},"page":"287-296","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Towards Good Practices for Multi-modal Fusion in Large-Scale Video Classification"],"prefix":"10.1007","author":[{"given":"Jinlai","family":"Liu","sequence":"first","affiliation":[]},{"given":"Zehuan","family":"Yuan","sequence":"additional","affiliation":[]},{"given":"Changhu","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,1,23]]},"reference":[{"key":"26_CR1","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. In: International Conference on Neural Information Processing Systems, pp. 1097\u20131105 (2012)"},{"key":"26_CR2","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"26_CR3","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z.: Rethinking the inception architecture for computer vision. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 2818\u20132826 (2016)","DOI":"10.1109\/CVPR.2016.308"},{"key":"26_CR4","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. In: Advances in Neural Information Processing Systems, pp. 568\u2013576 (2014)"},{"key":"26_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"20","DOI":"10.1007\/978-3-319-46484-8_2","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Wang","year":"2016","unstructured":"Wang, L., et al.: Temporal segment networks: towards good practices for deep action recognition. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016, Part VIII. LNCS, vol. 9912, pp. 20\u201336. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46484-8_2"},{"key":"26_CR6","doi-asserted-by":"crossref","unstructured":"Girdhar, R., Ramanan, D., Gupta, A., Sivic, J., Russell, B.: ActionVLAD: Learning spatio-temporal aggregation for action classification. In: CVPR, pp. 3165\u20133174 (2017)","DOI":"10.1109\/CVPR.2017.337"},{"issue":"6","key":"26_CR7","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1109\/MSP.2012.2205597","volume":"29","author":"G Hinton","year":"2012","unstructured":"Hinton, G., et al.: Deep neural networks for acoustic modeling in speech recognition: the shared views of four research groups. IEEE Signal Process. Mag. 29(6), 82\u201397 (2012)","journal-title":"IEEE Signal Process. Mag."},{"key":"26_CR8","doi-asserted-by":"crossref","unstructured":"Hershey, S., et al.: CNN architectures for large-scale audio classification. In: International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 131\u2013135 (2017)","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"26_CR9","unstructured":"Abu-El-Haija, S., et al.: Youtube-8m: A Large-scale Video Classification Benchmark (2016)"},{"issue":"8","key":"26_CR10","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"26_CR11","doi-asserted-by":"crossref","unstructured":"Cho, K., Van Merri\u00ebnboer, B., Bahdanau, D., Bengio, Y.: On the properties of neural machine translation: Encoder-decoder approaches. arXiv preprint arXiv:1409.1259 (2014)","DOI":"10.3115\/v1\/W14-4012"},{"key":"26_CR12","doi-asserted-by":"crossref","unstructured":"Gao, Y., Beijbom, O., Zhang, N., Darrell, T.: Compact bilinear pooling. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 317\u2013326 (2015)","DOI":"10.1109\/CVPR.2016.41"},{"key":"26_CR13","unstructured":"Kim, J.H., On, K.W., Lim, W., Kim, J., Ha, J.W., Zhang, B.T.: Hadamard product for low-rank bilinear pooling. arXiv preprint arXiv:1610.04325 (2016)"},{"key":"26_CR14","doi-asserted-by":"crossref","unstructured":"Yu, Z., Yu, J., Fan, J., Tao, D.: Multi-modal factorized bilinear pooling with co-attention learning for visual question answering. In: IEEE International Conference on Computer Vision (ICCV), pp. 1839\u20131848 (2017)","DOI":"10.1109\/ICCV.2017.202"},{"key":"26_CR15","doi-asserted-by":"crossref","unstructured":"Arandjelovic, R., Gronat, P., Torii, A., Pajdla, T., Sivic, J.: NetVLAD: CNN architecture for weakly supervised place recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5297\u20135307 (2016)","DOI":"10.1109\/CVPR.2016.572"},{"key":"26_CR16","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2009, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"26_CR17","doi-asserted-by":"crossref","unstructured":"Caba Heilbron, F., Escorcia, V., Ghanem, B., Carlos Niebles, J.: Activitynet: a large-scale video benchmark for human activity understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 961\u2013970 (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"26_CR18","doi-asserted-by":"crossref","unstructured":"Tran, D., Wang, H., Torresani, L., Ray, J., LeCun, Y., Paluri, M.: A closer look at spatiotemporal convolutions for action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6450\u20136459 (2018)","DOI":"10.1109\/CVPR.2018.00675"},{"key":"26_CR19","doi-asserted-by":"crossref","unstructured":"Qiu, Z., Yao, T., Mei, T.: Learning spatio-temporal representation with pseudo-3d residual networks. In: ICCV, pp. 5534\u20135542 (2017)","DOI":"10.1109\/ICCV.2017.590"},{"key":"26_CR20","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4724\u20134733. IEEE (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"26_CR21","doi-asserted-by":"crossref","unstructured":"J\u00e9gou, H., Douze, M., Schmid, C., P\u00e9rez, P.: Aggregating local descriptors into a compact image representation. In: Computer Vision and Pattern Recognition, pp. 3304\u20133311 (2010)","DOI":"10.1109\/CVPR.2010.5540039"},{"key":"26_CR22","doi-asserted-by":"crossref","unstructured":"Perronnin, F., Dance, C.: Fisher kernels on visual vocabularies for image categorization. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 1\u20138 (2007)","DOI":"10.1109\/CVPR.2007.383266"},{"key":"26_CR23","doi-asserted-by":"crossref","unstructured":"Ben-Younes, H., Cadene, R., Cord, M., Thome, N.: Mutan: multimodal tucker fusion for visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, vol. 3 (2017)","DOI":"10.1109\/ICCV.2017.285"},{"key":"26_CR24","unstructured":"Miech, A., Laptev, I., Sivic, J.: Learnable Pooling with Context Gating for Video Classification, arXiv preprint arXiv:1706.06905 (2017)"},{"key":"26_CR25","unstructured":"Pirsiavash, H., Ramanan, D., Fowlkes, C.: Bilinear classifiers for visual recognition. In: International Conference on Neural Information Processing Systems, pp. 1482\u20131490 (2009)"},{"issue":"4","key":"26_CR26","first-page":"4694","volume":"16","author":"YH Ng","year":"2015","unstructured":"Ng, Y.H., Hausknecht, M., Vijayanarasimhan, S., Vinyals, O., Monga, R., Toderici, G.: Beyond short snippets. Deep Netw. Video Classif. 16(4), 4694\u20134702 (2015)","journal-title":"Deep Netw. Video Classif."},{"key":"26_CR27","doi-asserted-by":"crossref","unstructured":"Wang, H., Ullah, M.M., Kl\u00e4ser, A., Laptev, I., Schmid, C.: Evaluation of local spatio-temporal features for action recognition. In: Proceedings of the British Machine Vision Conference, BMVC 2009, London, UK, 7\u201310 September 2009 (2009)","DOI":"10.5244\/C.23.124"},{"key":"26_CR28","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4471-2097-1_113","volume-title":"Hierarchical Mixtures of Experts and the EM Algorithm","author":"MI Jordan","year":"1994","unstructured":"Jordan, M.I., Jacobs, R.A.: Hierarchical Mixtures of Experts and the EM Algorithm. Springer, London (1994). https:\/\/doi.org\/10.1007\/978-1-4471-2097-1_113"},{"key":"26_CR29","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-11018-5_26","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,22]],"date-time":"2023-01-22T01:21:49Z","timestamp":1674350509000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-11018-5_26"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019]]},"ISBN":["9783030110178","9783030110185"],"references-count":29,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-11018-5_26","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2019]]},"assertion":[{"value":"23 January 2019","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}