{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,16]],"date-time":"2025-05-16T09:09:05Z","timestamp":1747386545998},"reference-count":26,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,7,18]],"date-time":"2022-07-18T00:00:00Z","timestamp":1658102400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,7,18]],"date-time":"2022-07-18T00:00:00Z","timestamp":1658102400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,7,18]]},"DOI":"10.1109\/icme52920.2022.9859865","type":"proceedings-article","created":{"date-parts":[[2022,8,26]],"date-time":"2022-08-26T19:45:18Z","timestamp":1661543118000},"page":"1-6","source":"Crossref","is-referenced-by-count":4,"title":["Modality-Specific Multimodal Global Enhanced Network for Text-Based Visual Question Answering"],"prefix":"10.1109","author":[{"given":"Zhi","family":"Yang","sequence":"first","affiliation":[{"name":"School of Computer Science &#x0026; Engineering, South China University of Technology,Guangzhou,China"}]},{"given":"Jun","family":"Xuan","sequence":"additional","affiliation":[{"name":"School of Computer Science &#x0026; Engineering, South China University of Technology,Guangzhou,China"}]},{"given":"Qing","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Computer Science &#x0026; Engineering, South China University of Technology,Guangzhou,China"}]},{"given":"Aihua","family":"Mao","sequence":"additional","affiliation":[{"name":"School of Computer Science &#x0026; Engineering, South China University of Technology,Guangzhou,China"}]}],"member":"263","reference":[{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413924"},{"key":"ref11","first-page":"3608","article-title":"Simple is not easy: A simple strong baseline for textvqa and textcaps","author":"zhu","year":"0","journal-title":"AAAI"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00864"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475425"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475606"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref16","first-page":"2692","article-title":"Pointer networks","author":"vinyals","year":"0","journal-title":"NIPS"},{"key":"ref17","first-page":"4171","article-title":"Bert: Pretraining of deep bidirectional transformers for language understanding","author":"devlin","year":"0","journal-title":"NAACL"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/423"},{"key":"ref4","first-page":"4101","article-title":"Check it again: Progressive visual question answering via visual entailment","author":"si","year":"0","journal-title":"ACL"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01028"},{"key":"ref6","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"0","journal-title":"NIPS"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00851"},{"key":"ref8","first-page":"715","article-title":"Spatially aware multimodal transformers for textvqa","author":"kant","year":"0","journal-title":"ECCV"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01001"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00644"},{"key":"ref9","article-title":"Structured multimodal attentions for textvqa","author":"gao","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref20","article-title":"A simple and robust convolutional-attention network for irregular text recognition","author":"wang","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2014.2339814"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/E17-2068"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00439"},{"key":"ref23","first-page":"289","article-title":"Hierarchical co-attention for visual question answering","author":"lu","year":"0","journal-title":"NIPS"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.coling-main.278"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01316-z"}],"event":{"name":"2022 IEEE International Conference on Multimedia and Expo (ICME)","start":{"date-parts":[[2022,7,18]]},"location":"Taipei, Taiwan","end":{"date-parts":[[2022,7,22]]}},"container-title":["2022 IEEE International Conference on Multimedia and Expo (ICME)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9859562\/9858923\/09859865.pdf?arnumber=9859865","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,9,19]],"date-time":"2022-09-19T20:23:38Z","timestamp":1663619018000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9859865\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,7,18]]},"references-count":26,"URL":"https:\/\/doi.org\/10.1109\/icme52920.2022.9859865","relation":{},"subject":[],"published":{"date-parts":[[2022,7,18]]}}}