{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T16:08:16Z","timestamp":1775578096024,"version":"3.50.1"},"reference-count":274,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"3","license":[{"start":{"date-parts":[[2020,3,1]],"date-time":"2020-03-01T00:00:00Z","timestamp":1583020800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,3,1]],"date-time":"2020-03-01T00:00:00Z","timestamp":1583020800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,3,1]],"date-time":"2020-03-01T00:00:00Z","timestamp":1583020800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Beijing Academy of Artificial Intelligence"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE J. Sel. Top. Signal Process."],"published-print":{"date-parts":[[2020,3]]},"DOI":"10.1109\/jstsp.2020.2987728","type":"journal-article","created":{"date-parts":[[2020,4,15]],"date-time":"2020-04-15T22:22:18Z","timestamp":1586989338000},"page":"478-493","source":"Crossref","is-referenced-by-count":401,"title":["Multimodal Intelligence: Representation Learning, Information Fusion, and Applications"],"prefix":"10.1109","volume":"14","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7730-5131","authenticated-orcid":false,"given":"Chao","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Zichao","family":"Yang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9463-9168","authenticated-orcid":false,"given":"Xiaodong","family":"He","sequence":"additional","affiliation":[]},{"given":"Li","family":"Deng","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref274","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2019.04.024"},{"key":"ref271","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46487-9_6"},{"key":"ref270","article-title":"A simple neural network module for relational reasoning","author":"santoro","year":"0"},{"key":"ref273","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2019.2928297"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-45465-9_59"},{"key":"ref272","article-title":"Freebase data dumps","year":"0"},{"key":"ref172","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1044"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1145\/2487575.2487591"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.202"},{"key":"ref173","article-title":"Hadamard product for low-rank bilinear pooling","author":"kim","year":"0"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1007\/BF02289464"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2018.2817340"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1137\/070690729"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.285"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1115"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.41"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"ref33","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"0","journal-title":"Minneapolis"},{"key":"ref32","article-title":"Improving language understanding by generative pre-training","author":"radford","year":"2018"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1202"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1166"},{"key":"ref267","article-title":"Neural-symbolic VQA: Disentangling reasoning from vision and language understanding","author":"yi","year":"0","journal-title":"Montreal"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1086"},{"key":"ref268","article-title":"Probabilistic neural-symbolic models for interpretable visual question answering","author":"vedantam","year":"0"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-10-5209-5"},{"key":"ref269","article-title":"The neuro-symbolic concept learner: Interpreting scenes, words, and sentences from natural supervision","author":"mao","year":"0"},{"key":"ref35","article-title":"Perspectives on predictive power of multimodal deep learning: Surprises and future directions","author":"bengio","year":"2019","journal-title":"Handbook of Multimodal-Multisensor Interfaces"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1631\/FITEE.1700826"},{"key":"ref181","article-title":"Bilinear attention networks","author":"kim","year":"0","journal-title":"Montreal"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1209"},{"key":"ref185","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"ref184","article-title":"Unifying visual-semantic embeddings with multimodal neural language models","author":"kiros","year":"2014","journal-title":"arXiv 1411 2539"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2017.2741510"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00679"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.345"},{"key":"ref188","article-title":"Show, attend and tell: Neural image caption generation with visual attention","author":"xu","year":"0"},{"key":"ref187","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298856"},{"key":"ref186","article-title":"Deep captioning with multimodal recurrent neural networks (M-RNN)","author":"mao","year":"0","journal-title":"arXiv 1412 6632"},{"key":"ref28","article-title":"Sequence to sequence learning with neural networks","author":"sutskever","year":"0"},{"key":"ref27","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"0"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018102"},{"key":"ref29","article-title":"Google's neural machine translation system: Bridging the gap between human and machine translation","author":"wu","year":"2016","journal-title":"arXiv 1609 08144"},{"key":"ref20","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"0","journal-title":"presented at the Int Conf Learn Theory"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.81"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2014.2383614"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"ref50","article-title":"Generative adversarial text to image synthesis","author":"reed","year":"0","journal-title":"New York"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00143"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46478-7_28"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.10"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref155","first-page":"2397","article-title":"Dynamic memory networks for visual and textual question answering","author":"xiong","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.540"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.499"},{"key":"ref151","first-page":"2048","article-title":"Show, attend and tell: Neural image caption generation with visual attention","author":"xu","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref146","article-title":"Efficient progressive neural architecture search","author":"p\u00e9rez-r\u00faa","year":"0","journal-title":"Cardiff"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2964297"},{"key":"ref148","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"0","journal-title":"San Diego"},{"key":"ref149","article-title":"Neural turing machines","author":"graves","year":"2014","journal-title":"arXiv 1410 5401"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2648793"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/11608288_66"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/MMSP.1999.793814"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2889052"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1121\/1.2229005"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/6046.865479"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00387"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"ref167","doi-asserted-by":"publisher","DOI":"10.1162\/089976600300015349"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.11"},{"key":"ref165","first-page":"361","article-title":"Multimodal residual learning for visual QA","author":"kim","year":"0","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref164","article-title":"Gated multimodal units for information fusion","author":"arevalo","year":"0"},{"key":"ref163","article-title":"High-order attention models for visual question answering","author":"schwartz","year":"0"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2019.05.001"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00118"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.232"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2014.09.003"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1561\/2000000039"},{"key":"ref6","author":"goodfellow","year":"2016","journal-title":"Deep Learning"},{"key":"ref5","doi-asserted-by":"crossref","first-page":"436","DOI":"10.1038\/nature14539","article-title":"Deep learning","volume":"521","author":"lecun","year":"2015","journal-title":"Nature"},{"key":"ref8","article-title":"Binary coding of speech spectrograms using a deep autoencoder","author":"deng","year":"0"},{"key":"ref159","article-title":"Hierarchical question-image co-attention for visual question answering","author":"lu","year":"0"},{"key":"ref7","article-title":"Roles of pre-training and fine-tuning in context-dependent DBN-HMMs for real-world speech recognition","author":"yu","year":"0","journal-title":"Presented Int Conf Neural Inf Process Syst"},{"key":"ref49","article-title":"Attribute2Image: Conditional image generation from visual attributes","author":"yan","year":"0","journal-title":"Amsterdam"},{"key":"ref157","article-title":"Co-attending free-form regions and detections with multi-modal multiplicative feature embedding for visual question answering","author":"lu","year":"0","journal-title":"New Orleans"},{"key":"ref9","article-title":"An overview of deep-structured learning for information processing","author":"deng","year":"0","journal-title":"presented at the Asian-Pacific Signal Inf Annu Summit Conf"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01245"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.1422953112"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.503"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.283"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.494"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.497"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00051"},{"key":"ref71","article-title":"Distributed representations of words and phrases and their compositionality","author":"mikolov","year":"0","journal-title":"Lake Tahoe"},{"key":"ref70","article-title":"Efficient estimation of word representations in vector space","author":"mikolov","year":"0","journal-title":"Scottadale"},{"key":"ref76","article-title":"Empirical evaluation of gated recurrent neural networks on sequence modeling","author":"chung","year":"2014","journal-title":"arXiv 1412 3555"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1145\/2736277.2741667"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/N15-1092"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3116"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201357"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003983"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1400"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.50"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1145\/2505515.2505665"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1145\/2567948.2577348"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2016.2520371"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1038\/323533a0"},{"key":"ref69","first-page":"1137","article-title":"A neural probabilistic language model","volume":"3","author":"bengio","year":"2003","journal-title":"J Mach Learn Res"},{"key":"ref197","article-title":"DRAW: A recurrent neural network for image generation","author":"gregor","year":"0"},{"key":"ref198","article-title":"Generating images from captions with attention","author":"mansimov","year":"0"},{"key":"ref199","article-title":"Conditional generative adversarial nets","author":"mirza","year":"2014"},{"key":"ref193","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2016.61"},{"key":"ref194","first-page":"955","author":"gan","year":"0","journal-title":"Proc Conf Comput Vis and Pattern Recog"},{"key":"ref195","article-title":"Generating diverse and accurate visual captions by comparative adversarial learning","author":"li","year":"2018"},{"key":"ref196","article-title":"Generating sequences with recurrent neural networks","author":"graves","year":"0"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-488"},{"key":"ref94","article-title":"Transfer learning from speaker verification to multispeaker text-to-speech synthesis","author":"jia","year":"2019","journal-title":"arXiv 1806 04558"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.127"},{"key":"ref93","article-title":"WaveNet: A generative model for raw audio","author":"van den oord","year":"2016","journal-title":"arXiv 1609 03499"},{"key":"ref191","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01095"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8269006"},{"key":"ref192","article-title":"MS-Celeb-1M: A dataset and benchmark for large-scale face recognition","author":"guo","year":"0"},{"key":"ref91","article-title":"A study of speaker adaptation for DNN-based speech synthesis","author":"wu","year":"0","journal-title":"Dresden"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462628"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2064307"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639211"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1353"},{"key":"ref97","article-title":"X-vectors meet emotions: A study on dependencies between emotion and speaker recognition","author":"pappagari","year":"2020","journal-title":"arXiv 2002 05039"},{"key":"ref82","article-title":"Skip-thought vectors","author":"kiros","year":"0"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P15-1128"},{"key":"ref84","article-title":"Generative adversarial nets","author":"goodfellow","year":"0","journal-title":"Monteral"},{"key":"ref83","article-title":"Linguistic regularities in continuous space word representations","author":"mikolov","year":"0","journal-title":"Atlanta J"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P14-2105"},{"key":"ref89","article-title":"A comparison of neural network feature transforms for speaker diarization","author":"yella","year":"0","journal-title":"Dresden"},{"key":"ref85","article-title":"Unsupervised representation learning with deep convolutional generative adversarial networks","author":"radford","year":"0","journal-title":"San Juan"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2011.6163922"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2013.6707705"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6853591"},{"key":"ref200","article-title":"Deep generative image models using a laplacian pyramid of adversarial networks","author":"denton","year":"0"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6854363"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/ICSLP.1996.607769"},{"key":"ref209","article-title":"Progressive growing of GANs for improved quality, stability, and variation","author":"karras","year":"0"},{"key":"ref203","first-page":"6629","article-title":"GANs trained by a two time-scale update rule converge to a local Nash equilibrium","author":"heusel","year":"0","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref204","first-page":"2642","article-title":"Conditional image synthesis with auxiliary classifier GANs","author":"odena","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref201","article-title":"Generative adversarial text to image synthesis","author":"reed","year":"0"},{"key":"ref202","first-page":"2234","article-title":"Improved techniques for training GANs","author":"salimans","year":"0","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2856256"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00595"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00649"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.629"},{"key":"ref211","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33013272"},{"key":"ref210","article-title":"TAC-GAN &#x2013; Text conditioned auxiliary classifier generative adversarial network","author":"dash","year":"0"},{"key":"ref212","article-title":"Cycle-consistency for robust visual question answering","author":"chen","year":"0"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00160"},{"key":"ref214","article-title":"Caltech-UCSD birds 200","author":"welinder","year":"2010"},{"key":"ref215","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2006.42"},{"key":"ref216","first-page":"740","article-title":"Microsoft COCO: Common objects in context","author":"lin","year":"0","journal-title":"Proc Eur Conf Comput Vision"},{"key":"ref217","article-title":"Learning what and where to draw","author":"reed","year":"0"},{"key":"ref218","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00133"},{"key":"ref219","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00878"},{"key":"ref220","article-title":"Generating multiple objects at spatially distinct locations","author":"hinz","year":"0"},{"key":"ref222","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_47"},{"key":"ref221","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00833"},{"key":"ref229","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-4024"},{"key":"ref228","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.01040"},{"key":"ref227","article-title":"ChatPainter: Improving text to image generation using dialogue","author":"sharma","year":"0","journal-title":"Vancouver"},{"key":"ref226","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00687"},{"key":"ref225","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00766"},{"key":"ref224","article-title":"Text-adaptive generative adversarial networks: Manipulating images with natural language","author":"nam","year":"0"},{"key":"ref223","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2916751"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.13"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.483"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093453"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.386"},{"key":"ref129","article-title":"Unicoder-VL: A universal encoder for vision and language by cross-modal pre-training","author":"li","year":"2019","journal-title":"arXiv 1908 06066"},{"key":"ref128","article-title":"Attention is all you need","author":"vaswani","year":"0","journal-title":"IEEE Long Beach"},{"key":"ref130","article-title":"VL-BERT: Pre-training of generic visuallinguistic representations","author":"su","year":"2019","journal-title":"arXiv 1908 08530"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1219"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"ref131","article-title":"VisualBERT: A simple and performant baseline for vision and language","author":"li","year":"2019","journal-title":"arXiv 1908 03557"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"ref232","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00649"},{"key":"ref233","article-title":"Video generation from text","author":"li","year":"0"},{"key":"ref230","article-title":"Sequential attention GAN for interactive image editing via dialogue","author":"chen","year":"0"},{"key":"ref231","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1651"},{"key":"ref239","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.575"},{"key":"ref238","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00639"},{"key":"ref235","article-title":"A multi-world approach to question answering about real-world scenes based on uncertain input","author":"malinowski","year":"0"},{"key":"ref234","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/276"},{"key":"ref237","article-title":"Exploring models and data for image question answering","author":"ren","year":"0","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref236","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.9"},{"key":"ref136","article-title":"OmniNet: A unified architecture for multi-modal multi-task learning","author":"pramanik","year":"2019","journal-title":"arXiv 1907 07804"},{"key":"ref135","article-title":"ViLBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","author":"lu","year":"0","journal-title":"Vancouver"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1145\/2993148.2993176"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1441"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2017.8019301"},{"key":"ref140","article-title":"Neural language modeling with visual features","author":"anastasopoulos","year":"2019","journal-title":"arXiv 1903 02930"},{"key":"ref141","first-page":"575","article-title":"CentralNet: A multilayer approach for multimodal fusion","author":"vielzeuf","year":"0","journal-title":"Proc Eur Conf Comput Vision"},{"key":"ref142","article-title":"Simple baseline for visual question answering","author":"zhou","year":"2015","journal-title":"arXiv 1512 02167"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00713"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1561\/2200000006"},{"key":"ref144","article-title":"Neural architecture search with reinforcement learning","author":"zoph","year":"0","journal-title":"Toulon"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1126\/science.1127647"},{"key":"ref145","first-page":"19","article-title":"Progressive neural architecture search","author":"liu","year":"0","journal-title":"Proc Eur Conf Comput Vision"},{"key":"ref241","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.475"},{"key":"ref242","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-018-1116-0"},{"key":"ref243","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2017\/179"},{"key":"ref244","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2754246"},{"key":"ref240","article-title":"Visual dialogue","author":"das","year":"0"},{"key":"ref248","article-title":"Overcoming language priors in visual question answering with adversarial regularization","author":"ramakrishnan","year":"0","journal-title":"Montreal"},{"key":"ref247","article-title":"Dont just assume; Look and answer: Overcoming priors for visual question answering","author":"agrawal","year":"0"},{"key":"ref246","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"ref245","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00331"},{"key":"ref249","article-title":"RUBi: Reducing unimodal biases in visual question answering","author":"cadene","year":"0"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P14-1068"},{"key":"ref108","article-title":"Multimodal learning with deep boltzmann machines","author":"salakhutdinov","year":"0"},{"key":"ref107","article-title":"Multimodal deep learning","author":"ngiam","year":"0","journal-title":"Bellevue"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2019.06.001"},{"key":"ref105","article-title":"Secure binary embeddings of front-end factor analysis for privacy preserving speaker verification","author":"port\u00ealo","year":"0","journal-title":"Lyon"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683373"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1158"},{"key":"ref102","article-title":"A structured self-attentive sentence embedding","author":"lin","year":"0","journal-title":"Toulon"},{"key":"ref111","first-page":"136","article-title":"Distributional semantics in technicolor","author":"bruni","year":"0","journal-title":"Proc Assoc Comput Linguistics"},{"key":"ref112","article-title":"Visual Word2Vec (vis-w2v): Learning visually grounded word embeddings using abstract scenes","author":"kottur","year":"0","journal-title":"LAS VEGAS"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"ref250","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.244"},{"key":"ref251","article-title":"Learning to count objects in natural images for visual question answering","author":"zhang","year":"0"},{"key":"ref254","article-title":"Active learning for visual question answering: An empirical study","author":"lin","year":"2017","journal-title":"arXiv 1711 01732"},{"key":"ref255","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00009"},{"key":"ref252","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00851"},{"key":"ref253","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00380"},{"key":"ref257","article-title":"FiLM: Visual reasoning with a general conditioning layer","author":"perez","year":"0"},{"key":"ref256","article-title":"Deep Bayesian active learning for multiple correct outputs","author":"jedoui","year":"2019","journal-title":"arXiv 1912 01119"},{"key":"ref259","article-title":"Deep compositional question answering with neural module networks","author":"andreas","year":"0"},{"key":"ref10","first-page":"235","article-title":"Discriminative pre-training of deep nerual networks","volume":"9","author":"yu","year":"0","journal-title":"U S Patent"},{"key":"ref258","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00209"},{"key":"ref11","first-page":"4688","article-title":"Large-vocabulry continuous speech recognition with context-dependent DBN-HMMs","author":"dahl","year":"0","journal-title":"Proc Int Conf Acoust Speech Signal Process"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639345"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2011.2134090"},{"key":"ref14","article-title":"Conversational speech transcription using context-dependent deep neural networks","author":"seide","year":"0","journal-title":"presented at the Interspeech"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2012.2205597"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639344"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"ref17","doi-asserted-by":"crossref","DOI":"10.1007\/978-1-4471-5779-3","author":"yu","year":"2015","journal-title":"Automatic Speech Recognition?A Deep Learning Approach"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00677"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/5.726791"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"ref119","article-title":"Learning factorized multimodal representations","author":"tsai","year":"0","journal-title":"Vancouver"},{"key":"ref114","article-title":"Learning representations by maximizing mutual information across views","author":"bachman","year":"0","journal-title":"Vancouver"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.538"},{"key":"ref116","article-title":"Deep fragment embeddings for bidirectional image sentence mapping","author":"karpathy","year":"0","journal-title":"Montreal"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/N15-1016"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00752"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01074"},{"key":"ref122","article-title":"Zero-shot learning through cross-modal transfer","author":"socher","year":"0","journal-title":"IEEE Long Beach"},{"key":"ref123","article-title":"DeViSE: A deep visual-semantic embedding model","author":"frome","year":"0","journal-title":"IEEE Long Beach"},{"key":"ref260","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1181"},{"key":"ref261","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.93"},{"key":"ref262","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.215"},{"key":"ref263","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.325"},{"key":"ref264","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_4"},{"key":"ref265","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00519"},{"key":"ref266","article-title":"Compositional attention networks for machine reasoning","author":"hudson","year":"0"}],"container-title":["IEEE Journal of Selected Topics in Signal Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/4200690\/9126272\/09068414.pdf?arnumber=9068414","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,30]],"date-time":"2023-09-30T00:03:18Z","timestamp":1696032198000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9068414\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,3]]},"references-count":274,"journal-issue":{"issue":"3"},"URL":"https:\/\/doi.org\/10.1109\/jstsp.2020.2987728","relation":{},"ISSN":["1932-4553","1941-0484"],"issn-type":[{"value":"1932-4553","type":"print"},{"value":"1941-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,3]]}}}