{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T07:28:24Z","timestamp":1740122904829,"version":"3.37.3"},"reference-count":38,"publisher":"Springer Science and Business Media LLC","issue":"9","license":[{"start":{"date-parts":[[2021,2,17]],"date-time":"2021-02-17T00:00:00Z","timestamp":1613520000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,2,17]],"date-time":"2021-02-17T00:00:00Z","timestamp":1613520000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2022,4]]},"DOI":"10.1007\/s11042-020-10357-y","type":"journal-article","created":{"date-parts":[[2021,2,19]],"date-time":"2021-02-19T11:12:08Z","timestamp":1613733128000},"page":"12047-12060","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["A central multimodal fusion framework for outdoor scene image segmentation"],"prefix":"10.1007","volume":"81","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1950-137X","authenticated-orcid":false,"given":"Yifei","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Olivier","family":"Morel","sequence":"additional","affiliation":[]},{"given":"Ralph","family":"Seulin","sequence":"additional","affiliation":[]},{"given":"Fabrice","family":"M\u00e9riaudeau","sequence":"additional","affiliation":[]},{"given":"D\u00e9sir\u00e9","family":"Sidib\u00e9","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,2,17]]},"reference":[{"key":"10357_CR1","doi-asserted-by":"publisher","unstructured":"Alghamdi A, Hammad M, Ugail H, Abdel-Raheem A, Muhammad K, Khalifa H, Abd El-Latif A (2020) Detection of 
myocardial infarction based on novel deep transfer learning methods for urban healthcare in smart cities. Multimedia Tools Appl https:\/\/doi.org\/10.1007\/s11042-020-08769-x","DOI":"10.1007\/s11042-020-08769-x"},{"key":"10357_CR2","unstructured":"Arevalo J, Solorio T, Montes-y G\u00f3mez M, Gonz\u00e1lez FA (2017) Gated multimodal units for information fusion. arXiv:1702.01992"},{"issue":"6","key":"10357_CR3","doi-asserted-by":"publisher","first-page":"345","DOI":"10.1007\/s00530-010-0182-0","volume":"16","author":"PK Atrey","year":"2010","unstructured":"Atrey PK, Hossain MA, El Saddik A, Kankanhalli MS (2010) Multimodal fusion for multimedia analysis: a survey. Multimedia Sys 16(6):345\u2013379","journal-title":"Multimedia Sys"},{"issue":"12","key":"10357_CR4","doi-asserted-by":"publisher","first-page":"2481","DOI":"10.1109\/TPAMI.2016.2644615","volume":"39","author":"V Badrinarayanan","year":"2017","unstructured":"Badrinarayanan V, Kendall A, Cipolla R (2017) Segnet: a deep convolutional encoder-decoder architecture for image segmentation. IEEE Trans Patt Anal Mach Intell 39(12):2481\u20132495","journal-title":"IEEE Trans Patt Anal Mach Intell"},{"key":"10357_CR5","doi-asserted-by":"crossref","unstructured":"Ben-Younes H, Cadene R, Cord M, Thome N (2017) Mutan: Multimodal tucker fusion for visual question answering. In: Proceedings of the IEEE international conference on computer vision, pp 2612\u20132620","DOI":"10.1109\/ICCV.2017.285"},{"key":"10357_CR6","doi-asserted-by":"crossref","unstructured":"Blanchon M, Morel O, Zhang Y, Seulin R, Crombez N, Sidib\u00e9 D (2019) Outdoor scenes pixel-wise semantic segmentation using polarimetry and fully convolutional network. In: 14th international conference on computer vision theory and applications (VISAPP 2019). Prague, Czech Republic. 
https:\/\/hal-univ-bourgogne.archives-ouvertes.fr\/hal-02024107","DOI":"10.5220\/0007360203280335"},{"key":"10357_CR7","doi-asserted-by":"crossref","unstructured":"Blum H, Gawel A, Siegwart R, Cadena C (2018) Modular sensor fusion for semantic segmentation. In: 2018 IEEE\/RSJ International conference on intelligent robots and systems (IROS). IEEE, pp 3670\u20133677","DOI":"10.1109\/IROS.2018.8593786"},{"key":"10357_CR8","doi-asserted-by":"crossref","unstructured":"Chen LC, Zhu Y, Papandreou G, Schroff F, Adam H (2018) Encoder-decoder with atrous separable convolution for semantic image segmentation. arXiv:1802.02611","DOI":"10.1007\/978-3-030-01234-2_49"},{"key":"10357_CR9","doi-asserted-by":"crossref","unstructured":"Cordts M, Omran M, Ramos S, Rehfeld T, Enzweiler M, Benenson R, Franke U, Roth S, Schiele B (2016) The cityscapes dataset for semantic urban scene understanding. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR)","DOI":"10.1109\/CVPR.2016.350"},{"key":"10357_CR10","doi-asserted-by":"crossref","unstructured":"Deng J, Dong W, Socher R, Li LJ, Li FF (2009) Imagenet: a large-scale hierarchical image database. In: IEEE conference on computer vision & pattern recognition","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"10357_CR11","unstructured":"Deng L, Yang M, Li T, He Y, Wang C (2019) Rfbnet: deep multimodal networks with residual fusion blocks for rgb-d semantic segmentation. arXiv:1907.00135"},{"key":"10357_CR12","doi-asserted-by":"crossref","unstructured":"Geiger A, Lenz P, Stiller C, Urtasun R (2013) Vision meets robotics: the kitti dataset. Int J Robot Res (IJRR)","DOI":"10.1177\/0278364913491297"},{"key":"10357_CR13","doi-asserted-by":"crossref","unstructured":"Gupta S, Girshick R, Arbel\u00e1ez P, Malik J (2014) Learning rich features from rgb-d images for object detection and segmentation. In: European conference on computer vision. 
Springer, pp 345\u2013360","DOI":"10.1007\/978-3-319-10584-0_23"},{"key":"10357_CR14","doi-asserted-by":"crossref","unstructured":"Harchanko JS, Chenault DB (2005) Water-surface object detection and classification using imaging polarimetry. In: Polarization science and remote sensing II, vol 5888. International Society for Optics and Photonics, p 588815","DOI":"10.1117\/12.623542"},{"key":"10357_CR15","doi-asserted-by":"crossref","unstructured":"Hazirbas C, Ma L, Domokos C, Cremers D (2016) Fusenet: incorporating depth into semantic segmentation via fusion-based cnn architecture. In: Asian conference on computer vision. Springer, pp 213\u2013228","DOI":"10.1007\/978-3-319-54181-5_14"},{"issue":"1","key":"10357_CR16","doi-asserted-by":"publisher","first-page":"79","DOI":"10.1162\/neco.1991.3.1.79","volume":"3","author":"RA Jacobs","year":"1991","unstructured":"Jacobs RA, Jordan MI, Nowlan SJ, Hinton GE, et al. (1991) Adaptive mixtures of local experts. Neural Comput 3(1):79\u201387","journal-title":"Neural Comput"},{"key":"10357_CR17","unstructured":"Jiang J, Zheng L, Luo F, Zhang Z (2018) Rednet: residual encoder-decoder network for indoor rgb-d semantic segmentation. arXiv:1806.01054"},{"key":"10357_CR18","doi-asserted-by":"crossref","unstructured":"Kaymak \u00c7, U\u00e7ar A (2019) A brief survey and an application of semantic image segmentation for autonomous driving. In: Handbook of deep learning applications. Springer, pp 161\u2013200","DOI":"10.1007\/978-3-030-11479-4_9"},{"issue":"9","key":"10357_CR19","doi-asserted-by":"publisher","first-page":"1449","DOI":"10.1109\/JPROC.2015.2460697","volume":"103","author":"D Lahat","year":"2015","unstructured":"Lahat D, Adali T, Jutten C (2015) Multimodal data fusion: an overview of methods, challenges, and prospects. 
Proc IEEE 103(9):1449\u20131477","journal-title":"Proc IEEE"},{"key":"10357_CR20","doi-asserted-by":"crossref","unstructured":"Li Y, Zhang J, Cheng Y, Huang K, Tan T (2017) Semantics-guided multi-level rgb-d feature fusion for indoor semantic segmentation. In: 2017 IEEE International conference on image processing (ICIP). IEEE, pp 1262\u20131266","DOI":"10.1109\/ICIP.2017.8296484"},{"key":"10357_CR21","doi-asserted-by":"crossref","unstructured":"Li Z, Gan Y, Liang X, Yu Y, Cheng H, Lin L (2016) Lstm-cf: Unifying context modeling and fusion with lstms for rgb-d scene labeling. In: European conference on computer vision. Springer, pp 541\u2013557","DOI":"10.1007\/978-3-319-46475-6_34"},{"key":"10357_CR22","unstructured":"Lin M, Chen Q, Yan S (2013) Network in network. arXiv:1312.4400"},{"key":"10357_CR23","doi-asserted-by":"crossref","unstructured":"Long J, Shelhamer E, Darrell T (2015) Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3431\u20133440","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"10357_CR24","doi-asserted-by":"crossref","unstructured":"Lu X, Wang W, Ma C, Shen J, Shao L, Porikli F (2019) See more know more: unsupervised video object segmentation with co-attention siamese networks","DOI":"10.1109\/CVPR.2019.00374"},{"key":"10357_CR25","unstructured":"Oberweger M, Wohlhart P, Lepetit V (2015) Hands deep in deep learning for hand pose estimation. arXiv:1502.06807"},{"key":"10357_CR26","unstructured":"Park SJ, Hong KS, Lee S (2017) Rdfnet: Rgb-d multi-level residual feature fusion for indoor semantic segmentation. In: Proceedings of the IEEE international conference on computer vision, pp 4980\u20134989"},{"key":"10357_CR27","unstructured":"Paszke A, Chaurasia A, Kim S, Culurciello E (2016) Enet: a deep neural network architecture for real-time semantic segmentation. 
arXiv:1606.02147"},{"key":"10357_CR28","unstructured":"Paszke A, Gross S, Chintala S, Chanan G, Yang E, DeVito Z, Lin Z, Desmaison A, Antiga L, Lerer A (2017) Automatic differentiation in pytorch"},{"issue":"6","key":"10357_CR29","doi-asserted-by":"publisher","first-page":"96","DOI":"10.1109\/MSP.2017.2738401","volume":"34","author":"D Ramachandram","year":"2017","unstructured":"Ramachandram D, Taylor GW (2017) Deep multimodal learning: a survey on recent advances and trends. IEEE Signal Proc Mag 34(6):96\u2013108","journal-title":"IEEE Signal Proc Mag"},{"key":"10357_CR30","doi-asserted-by":"crossref","unstructured":"Ronneberger O, Fischer P, Brox T (2015) U-net: convolutional networks for biomedical image segmentation. In: International conference on medical image computing and computer-assisted intervention. Springer, pp 234\u2013241","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"10357_CR31","unstructured":"Simonyan K, Zisserman A (2014) Very deep convolutional networks for large-scale image recognition. Computer Science"},{"key":"10357_CR32","doi-asserted-by":"crossref","unstructured":"Valada A, Mohan R, Burgard W (2018) Self-supervised model adaptation for multimodal semantic segmentation. arXiv:1808.03833","DOI":"10.1007\/s11263-019-01188-y"},{"key":"10357_CR33","doi-asserted-by":"crossref","unstructured":"Valada A, Oliveira G, Brox T, Burgard W (2016) Deep multispectral semantic scene understanding of forested environments using multimodal fusion. http:\/\/ais.informatik.uni-freiburg.de\/publications\/papers\/valada16iser.pdf","DOI":"10.1007\/978-3-319-50115-4_41"},{"key":"10357_CR34","doi-asserted-by":"crossref","unstructured":"Valada A, Vertens J, Dhall A, Burgard W (2017) Adapnet: adaptive semantic segmentation in adverse environmental conditions. In: Proceedings of the IEEE international conference on robotics and automation (ICRA). 
IEEE, pp 4644\u20134651","DOI":"10.1109\/ICRA.2017.7989540"},{"key":"10357_CR35","doi-asserted-by":"crossref","unstructured":"Vielzeuf V, Lechervy A, Pateux S, Jurie F (2018) Centralnet: a multilayer approach for multimodal fusion. In: Proceedings of the European conference on computer vision (ECCV), pp 0\u20130","DOI":"10.1007\/978-3-030-11024-6_44"},{"key":"10357_CR36","doi-asserted-by":"crossref","unstructured":"Wang W, Lu X, Shen J, Crandall D, Shao L (2019) Zero-shot video object segmentation via attentive graph neural networks","DOI":"10.1109\/ICCV.2019.00933"},{"key":"10357_CR37","unstructured":"Yu F, Koltun V (2015) Multi-scale context aggregation by dilated convolutions. arXiv:1511.07122"},{"key":"10357_CR38","doi-asserted-by":"crossref","unstructured":"Zhang Y, Morel O, Blanchon M, Seulin R, Rastgoo M, Sidib\u00e9 D (2019) Exploration of deep learning-based multimodal fusion for semantic road scene segmentation. In: VISAPP 2019 14Th international conference on computer vision theory and applications","DOI":"10.5220\/0007360403360343"}],"container-title":["Multimedia Tools and 
Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-020-10357-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-020-10357-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-020-10357-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,4,13]],"date-time":"2022-04-13T19:49:24Z","timestamp":1649879364000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-020-10357-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,2,17]]},"references-count":38,"journal-issue":{"issue":"9","published-print":{"date-parts":[[2022,4]]}},"alternative-id":["10357"],"URL":"https:\/\/doi.org\/10.1007\/s11042-020-10357-y","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"type":"print","value":"1380-7501"},{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2021,2,17]]},"assertion":[{"value":"14 February 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 September 2020","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 December 2020","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 February 2021","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}