{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,7]],"date-time":"2024-09-07T13:28:33Z","timestamp":1725715713130},"reference-count":22,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,5,23]],"date-time":"2022-05-23T00:00:00Z","timestamp":1653264000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,5,23]],"date-time":"2022-05-23T00:00:00Z","timestamp":1653264000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,5,23]]},"DOI":"10.1109\/icassp43922.2022.9746969","type":"proceedings-article","created":{"date-parts":[[2022,4,27]],"date-time":"2022-04-27T19:50:34Z","timestamp":1651089034000},"page":"1985-1989","source":"Crossref","is-referenced-by-count":1,"title":["Multi-Modal Learning with Text Merging for TEXTVQA"],"prefix":"10.1109","author":[{"given":"Changsheng","family":"Xu","sequence":"first","affiliation":[{"name":"Fudan University,School of Computer Science,Shanghai,China,200438"}]},{"given":"Zhenlong","family":"Xu","sequence":"additional","affiliation":[{"name":"Fudan University,School of Computer Science,Shanghai,China,200438"}]},{"given":"Yifan","family":"He","sequence":"additional","affiliation":[{"name":"Fudan University,School of Computer Science,Shanghai,China,200438"}]},{"given":"Shuigeng","family":"Zhou","sequence":"additional","affiliation":[{"name":"Fudan University,School of Computer Science,Shanghai,China,200438"}]},{"given":"Jihong","family":"Guan","sequence":"additional","affiliation":[{"name":"Tongji University,Dept. of Computer Sci. &amp; Techl,Shanghai,China,201804"}]}],"member":"263","reference":[{"key":"ref10","article-title":"Vilbert: Pretraining Task-agnostic Visiolinguistic Representations for Vision-and-language Tasks","author":"lu","year":"2019","journal-title":"NIPS"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00439"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00156"},{"key":"ref13","first-page":"1563","article-title":"Icdar 2019 Competition on Scene Text Visual Question Answering","author":"biten","year":"2019","journal-title":"ICDAR"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01001"},{"key":"ref15","first-page":"2953","article-title":"Exploring Models and Data for Image Question Answering","author":"ren","year":"2015","journal-title":"NIPS"},{"key":"ref16","first-page":"1682","article-title":"A Multi-World Approach to Question Answering About Real-World Scenes Based on Uncertain Input","author":"malinowski","year":"2014","journal-title":"NIPS"},{"key":"ref17","first-page":"2296","article-title":"Are You Talking to A Machine? Dataset and Methods for Multilingual Image Question Answering","author":"gao","year":"2015","journal-title":"NIPS"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.10"},{"key":"ref19","first-page":"4171","article-title":"Bert: Pre-training of Deep Bidirectional Transformers for Language Understanding","author":"devlin","year":"2019","journal-title":"NAACL"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00851"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2005.858619"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.215"},{"key":"ref2","article-title":"System and Method for Text Translations and Annotation in An Instant Messaging Session","author":"cheung","year":"2008","journal-title":"US Patent"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2011.6116198"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.285"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2014.2339814"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00051"}],"event":{"name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","start":{"date-parts":[[2022,5,23]]},"location":"Singapore, Singapore","end":{"date-parts":[[2022,5,27]]}},"container-title":["ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9745891\/9746004\/09746969.pdf?arnumber=9746969","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,22]],"date-time":"2022-08-22T20:10:43Z","timestamp":1661199043000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9746969\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,5,23]]},"references-count":22,"URL":"https:\/\/doi.org\/10.1109\/icassp43922.2022.9746969","relation":{},"subject":[],"published":{"date-parts":[[2022,5,23]]}}}