{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T14:46:54Z","timestamp":1776091614402,"version":"3.50.1"},"publisher-location":"Cham","reference-count":36,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030208691","type":"print"},{"value":"9783030208707","type":"electronic"}],"license":[{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2019]]},"DOI":"10.1007\/978-3-030-20870-7_6","type":"book-chapter","created":{"date-parts":[[2019,5,24]],"date-time":"2019-05-24T16:14:21Z","timestamp":1558714461000},"page":"90-106","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":35,"title":["Robust Deep Multi-modal Learning Based on Gated Information Fusion Network"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6603-0083","authenticated-orcid":false,"given":"Jaekyum","family":"Kim","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2318-9128","authenticated-orcid":false,"given":"Junho","family":"Koh","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8266-4211","authenticated-orcid":false,"given":"Yecheol","family":"Kim","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1998-6132","authenticated-orcid":false,"given":"Jaehyung","family":"Choi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3400-0493","authenticated-orcid":false,"given":"Youngbae","family":"Hwang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3733-0148","authenticated-orcid":false,"given":"Jun Won","family":"Choi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2019,5,25]]},"reference":[{"key":"6_CR1","unstructured":"Arevalo, J., Solorio, T., Montes-y G\u00f3mez, M., Gonz\u00e1lez, F.A.: Gated multimodal units for information fusion. arXiv preprint arXiv:1702.01992 (2017)"},{"key":"6_CR2","doi-asserted-by":"publisher","first-page":"423","DOI":"10.1109\/TPAMI.2018.2798607","volume":"41","author":"T Baltru\u0161aitis","year":"2018","unstructured":"Baltru\u0161aitis, T., Ahuja, C., Morency, L.P.: Multimodal machine learning: a survey and taxonomy. IEEE Trans. Pattern Anal. Mach. Intell. 41, 423\u2013443 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"6_CR3","doi-asserted-by":"crossref","unstructured":"Chabot, F., Chaouch, M., Rabarisoa, J., Teuli\u00e8re, C., Chateau, T.: Deep manta: a coarse-to-fine many-task network for joint 2D and 3D vehicle analysis from monocular image. In: Proceedings of IEEE Conference on Computer Vision Pattern Recog (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.198"},{"key":"6_CR4","doi-asserted-by":"crossref","unstructured":"Chen, X., Kundu, K., Zhang, Z., Ma, H., Fidler, S., Urtasun, R.: Monocular 3D object detection for autonomous driving. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.236"},{"key":"6_CR5","unstructured":"Chen, X., et al.: 3D object proposals for accurate object class detection. In: Advance in Neural Information Processing Systems (2015)"},{"key":"6_CR6","doi-asserted-by":"crossref","unstructured":"Chen, X., Ma, H., Wan, J., Li, B., Xia, T.: Multi-view 3D object detection network for autonomous driving. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.691"},{"key":"6_CR7","doi-asserted-by":"crossref","unstructured":"Eitel, A., Springenberg, J.T., Spinello, L., Riedmiller, M.A., Burgard, W.: Multimodal deep learning for robust RGB-D object recognition. In: Proceedings of IEEE\/RSJ Interernational Conference on Intelligent Robots and Systems (IROS) (2015)","DOI":"10.1109\/IROS.2015.7353446"},{"key":"6_CR8","doi-asserted-by":"crossref","unstructured":"Geiger, A., Lenz, P., Urtasun, R.: Are we ready for autonomous driving? The kitti vision benchmark suite. In: Proceedings of IEEE Confernce on Computer Vision and Pattern Recognition (CVPR) (2012)","DOI":"10.1109\/CVPR.2012.6248074"},{"key":"6_CR9","doi-asserted-by":"crossref","unstructured":"Girshick, R.: Fast R-CNN. In: Proceedings IEEE International Conference on Computer Vision (ICCV) (2015)","DOI":"10.1109\/ICCV.2015.169"},{"key":"6_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"345","DOI":"10.1007\/978-3-319-10584-0_23","volume-title":"Computer Vision \u2013 ECCV 2014","author":"S Gupta","year":"2014","unstructured":"Gupta, S., Girshick, R., Arbel\u00e1ez, P., Malik, J.: Learning rich features from RGB-D images for object detection and segmentation. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8695, pp. 345\u2013360. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10584-0_23"},{"key":"6_CR11","doi-asserted-by":"crossref","unstructured":"Gupta, S., Hoffman, J., Malik, J.: Cross modal distillation for supervision transfer. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.309"},{"key":"6_CR12","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.90"},{"issue":"8","key":"6_CR13","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"6_CR14","doi-asserted-by":"crossref","unstructured":"Hoffman, J., Gupta, S., Darrell, T.: Learning with side information through modality hallucination. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.96"},{"key":"6_CR15","series-title":"Advances in Computer Vision and Pattern Recognition","doi-asserted-by":"publisher","first-page":"141","DOI":"10.1007\/978-1-4471-4640-7_8","volume-title":"Consumer Depth Cameras for Computer Vision","author":"A Janoch","year":"2013","unstructured":"Janoch, A., et al.: A category-level 3D object dataset: putting the kinect to work. In: Fossati, A., Gall, J., Grabner, H., Konolige, K., Ren, X. (eds.) Consumer Depth Cameras for Computer Vision. ACVPR, pp. 141\u2013165. Springer, London (2013). https:\/\/doi.org\/10.1007\/978-1-4471-4640-7_8"},{"key":"6_CR16","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1007\/s12193-015-0195-2","volume":"10","author":"SE Kahou","year":"2015","unstructured":"Kahou, S.E., et al.: Emonets: multimodal deep learning approaches for emotion recognition in video. J. Multimodal User Interfaces 10, 99\u2013111 (2015)","journal-title":"J. Multimodal User Interfaces"},{"key":"6_CR17","doi-asserted-by":"crossref","unstructured":"Ku, J., Mozifian, M., Lee, J., Harakeh, A., Waslander, S.: Joint 3D proposal generation and object detection from view aggregation. arXiv preprint arXiv:1712.02294 (2017)","DOI":"10.1109\/IROS.2018.8594049"},{"key":"6_CR18","doi-asserted-by":"publisher","first-page":"436","DOI":"10.1038\/nature14539","volume":"521","author":"Y Lecun","year":"2015","unstructured":"Lecun, Y., Bengio, Y., Hinton, G.: Deep learning. Nature 521, 436\u2013444 (2015)","journal-title":"Nature"},{"key":"6_CR19","doi-asserted-by":"crossref","unstructured":"Li, Y., Zhang, J., Cheng, Y., Huang, K., Tan, T.: Semantics-guided multi-level RGB-D feature fusion for indoor semantic segmentation. In: 2017 IEEE International Conference on Image Processing (ICIP) (2017)","DOI":"10.1109\/ICIP.2017.8296484"},{"key":"6_CR20","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1007\/978-3-319-46448-0_2","volume-title":"Computer Vision \u2013 ECCV 2016","author":"W Liu","year":"2016","unstructured":"Liu, W., et al.: SSD: single shot multibox detector. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 21\u201337. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_2"},{"key":"6_CR21","doi-asserted-by":"crossref","unstructured":"Mroueh, Y., Marcheret, E., Goel, V.: Deep multimodal learning for audio-visual speech recognition. In: Proceedings of IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP) (2015)","DOI":"10.1109\/ICASSP.2015.7178347"},{"key":"6_CR22","unstructured":"Ngiam, J., Khosla, A., Kim, M., Nam, J., Lee, H., Ng, A.Y.: Multimodal deep learning. In: Proceedings of International Conference on Machine Learning (ICML) (2011)"},{"issue":"4","key":"6_CR23","doi-asserted-by":"publisher","first-page":"722","DOI":"10.1007\/s10489-014-0629-7","volume":"42","author":"K Noda","year":"2015","unstructured":"Noda, K., Yamaguchi, Y., Nakadai, K., Okuno, H.G., Ogata, T.: Audio-visual speech recognition using deep learning. Appl. Intell. 42(4), 722\u2013737 (2015)","journal-title":"Appl. Intell."},{"key":"6_CR24","doi-asserted-by":"crossref","unstructured":"Poria, S., Cambria, E., Gelbukh, A.: Deep convolutional neural network textual features and multiple kernel learning for utterance-level multimodal sentiment analysis. In: Proceedings of Conference Empirical Methods in Natural Language Processing, pp. 2539\u20133544 (2015)","DOI":"10.18653\/v1\/D15-1303"},{"key":"6_CR25","doi-asserted-by":"crossref","unstructured":"Radu, V., Lane, N.D., Bhattacharya, S., Mascolo, C., Marina, M.K., Kawsar, F.: Towards multimodal deep learning for activity recognition on mobile devices. In: Proceedings of 2016 ACM Interernational Joint Confernce on Pervasive and Ubiquitous Computing, pp. 185\u2013188 (2016)","DOI":"10.1145\/2968219.2971461"},{"issue":"6","key":"6_CR26","doi-asserted-by":"publisher","first-page":"96","DOI":"10.1109\/MSP.2017.2738401","volume":"34","author":"D Ramachandram","year":"2017","unstructured":"Ramachandram, D., Taylor, G.W.: Deep multimodal learning. IEEE Signal Process. Mag. 34(6), 96\u2013108 (2017)","journal-title":"IEEE Signal Process. Mag."},{"key":"6_CR27","doi-asserted-by":"crossref","unstructured":"Redmon, J., Divvala, S., Girshick, R., Farhadi, A.: You only look once: unified, real-time object detection. In: Proceedings of IEEE Confernce on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.91"},{"key":"6_CR28","doi-asserted-by":"crossref","unstructured":"Redmon, J., Farhadi, A.: Yolo9000: better, faster, stronger. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.690"},{"key":"6_CR29","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: Towards real-time object detection with region proposal networks. In: Advances in Neural Information Processing Systems (2015)"},{"key":"6_CR30","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"746","DOI":"10.1007\/978-3-642-33715-4_54","volume-title":"Computer Vision \u2013 ECCV 2012","author":"N Silberman","year":"2012","unstructured":"Silberman, N., Hoiem, D., Kohli, P., Fergus, R.: Indoor segmentation and support inference from RGBD images. In: Fitzgibbon, A., Lazebnik, S., Perona, P., Sato, Y., Schmid, C. (eds.) ECCV 2012. LNCS, vol. 7576, pp. 746\u2013760. Springer, Heidelberg (2012). https:\/\/doi.org\/10.1007\/978-3-642-33715-4_54"},{"key":"6_CR31","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)"},{"key":"6_CR32","doi-asserted-by":"crossref","unstructured":"Song, S., Lichtenberg, S.P., Xiao, J.: Sun RGB-D: a RGB-D scene understanding benchmark suite. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2015.7298655"},{"key":"6_CR33","first-page":"2949","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava, N., Salakhutdinov, R.: Multimodal learning with deep boltzmann machines. J. Mach. Learn. Res. 15, 2949\u20132980 (2014)","journal-title":"J. Mach. Learn. Res."},{"key":"6_CR34","doi-asserted-by":"crossref","unstructured":"Xiao, J., Owens, A., Torralba, A.: Sun3D: a database of big spaces reconstructed using SFM and object labels. In: Proceedings of IEEE International Conference on Computer Vision (ICCV) (2013)","DOI":"10.1109\/ICCV.2013.458"},{"key":"6_CR35","doi-asserted-by":"crossref","unstructured":"Xu, D., Anguelov, D., Jain, A.: Pointfusion: deep sensor fusion for 3D bounding box estimation. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2018.00033"},{"key":"6_CR36","doi-asserted-by":"publisher","first-page":"300","DOI":"10.1016\/j.patcog.2017.07.026","volume":"72","author":"X Xu","year":"2017","unstructured":"Xu, X., Li, Y., Wu, G., Luo, J.: Multi-modal deep feature learning for RGB-D object detection. Pattern Recogn. 72, 300\u2013313 (2017)","journal-title":"Pattern Recogn."}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ACCV 2018"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-20870-7_6","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,10,5]],"date-time":"2021-10-05T11:17:52Z","timestamp":1633432672000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-20870-7_6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019]]},"ISBN":["9783030208691","9783030208707"],"references-count":36,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-20870-7_6","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019]]},"assertion":[{"value":"25 May 2019","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ACCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Asian Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Perth, WA","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Australia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 December 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6 December 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"accv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/accv2018.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Microsoft CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"979","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"274","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.7","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}