{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T05:23:33Z","timestamp":1755926613152,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Beijing Natural Science Foundation","award":["4192028"],"award-info":[{"award-number":["4192028"]}]},{"name":"National Natural Science Foundation of China","award":["61772535"],"award-info":[{"award-number":["61772535"]}]},{"name":"National Key R&D Program of China","award":["2020AAA0108600"],"award-info":[{"award-number":["2020AAA0108600"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475452","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T05:40:18Z","timestamp":1634535618000},"page":"3097-3105","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["Question-controlled Text-aware Image Captioning"],"prefix":"10.1145","author":[{"given":"Anwen","family":"Hu","sequence":"first","affiliation":[{"name":"Renmin University of China, Beijing, China"}]},{"given":"Shizhe","family":"Chen","sequence":"additional","affiliation":[{"name":"INRIA, Paris, France"}]},{"given":"Qin","family":"Jin","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"volume-title":"SPICE: Semantic Propositional Image Caption Evaluation. In ECCV (5) (Lecture Notes in Computer Science","year":"2016","author":"Anderson Peter","key":"e_1_3_2_2_1_1"},{"volume-title":"Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering","author":"Anderson Peter","key":"e_1_3_2_2_2_1","doi-asserted-by":"crossref","DOI":"10.1109\/CVPR.2018.00636"},{"volume-title":"Schwing","year":"2019","author":"Aneja Jyoti","key":"e_1_3_2_2_3_1"},{"volume-title":"Marcal Rusinol, C. V. Jawahar, Ernest Valveny, and Dimosthenis Karatzas.","year":"2019","author":"Biten Ali Furkan","key":"e_1_3_2_2_4_1"},{"volume-title":"Say As You Wish: Fine-Grained Control of Image Caption Generation With Abstract Scene Graphs","author":"Chen Shizhe","key":"e_1_3_2_2_5_1"},{"volume-title":"Control and Tell: A Framework for Generating Controllable and Grounded Captions","author":"Cornia Marcella","key":"e_1_3_2_2_6_1"},{"volume-title":"Denkowski and Alon Lavie","year":"2014","author":"Michael","key":"e_1_3_2_2_7_1"},{"volume-title":"Forsyth","year":"2019","author":"Deshpande Aditya","key":"e_1_3_2_2_8_1"},{"volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL-HLT (1)","year":"2019","author":"Devlin Jacob","key":"e_1_3_2_2_9_1"},{"volume-title":"EMNLP (1)","author":"Fisch Adam","key":"e_1_3_2_2_10_1"},{"volume-title":"ECCV (17) (Lecture Notes in Computer Science","author":"Gurari Danna","key":"e_1_3_2_2_11_1"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3455286"},{"volume-title":"Iterative Answer Prediction With Pointer-Augmented Multimodal Transformers for TextVQA","author":"Hu Ronghang","key":"e_1_3_2_2_13_1"},{"volume-title":"Attention on Attention for Image Captioning","author":"Huang Lun","key":"e_1_3_2_2_14_1","doi-asserted-by":"crossref","DOI":"10.1109\/ICCV.2019.00473"},{"volume-title":"ECCV (9) (Lecture Notes in Computer Science","author":"Kant Yash","key":"e_1_3_2_2_15_1"},{"volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81.","year":"2004","author":"Lin Chin-Yew","key":"e_1_3_2_2_16_1"},{"volume-title":"Piotr Doll\u00e1r, and C. Lawrence Zitnick","year":"2014","author":"Lin Tsung-Yi","key":"e_1_3_2_2_17_1"},{"volume-title":"Knowing When to Look: Adaptive Attention via a Visual Sentinel for Image Captioning","author":"Lu Jiasen","key":"e_1_3_2_2_18_1"},{"key":"e_1_3_2_2_19_1","unstructured":"Meredith Ringel Morris Jazette Johnson Cynthia L. Bennett and Edward Cutrell. 2018. Rich Representations of Visual Content for Screen Reader Users. In CHI. ACM 59.  Meredith Ringel Morris Jazette Johnson Cynthia L. Bennett and Edward Cutrell. 2018. Rich Representations of Visual Content for Screen Reader Users. In CHI. ACM 59."},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_2_21_1","first-page":"1","article-title":"2020. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer","volume":"21","author":"Raffel Colin","year":"2020","journal-title":"J. Mach. Learn. Res."},{"volume-title":"100,000+ Questions for Machine Comprehension of Text","author":"Rajpurkar Pranav","key":"e_1_3_2_2_22_1"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.131"},{"volume-title":"ECCV (2) (Lecture Notes in Computer Science","author":"Sidorov Oleksii","key":"e_1_3_2_2_24_1"},{"volume-title":"Meet Shah, Yu Jiang, Xinlei Chen, Dhruv Batra, Devi Parikh, and Marcus Rohrbach.","year":"2019","author":"Singh Amanpreet","key":"e_1_3_2_2_25_1"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"volume-title":"CIDEr: Consensus-based image description evaluation","author":"Vedantam Ramakrishna","key":"e_1_3_2_2_27_1"},{"volume-title":"Show and tell: A neural image caption generator","author":"Vinyals Oriol","key":"e_1_3_2_2_28_1"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413753"},{"volume-title":"Chan","year":"2019","author":"Wang Qingzhong","key":"e_1_3_2_2_30_1"},{"volume-title":"Confidence-aware Non-repetitive Multimodal Transformers for TextCaps","author":"Wang Zhaokai","key":"e_1_3_2_2_31_1"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.5555\/3045118.3045336"},{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 8751--8761","year":"2020","author":"Yang Zhengyuan","key":"e_1_3_2_2_33_1"},{"volume-title":"Intention Oriented Image Captions With Guiding Objects","author":"Zheng Yue","key":"e_1_3_2_2_34_1","doi-asserted-by":"crossref","DOI":"10.1109\/CVPR.2019.00859"},{"volume-title":"Simple is not Easy: A Simple Strong Baseline for TextVQA and TextCaps","author":"Zhu Qi","key":"e_1_3_2_2_35_1"}],"event":{"name":"MM '21: ACM Multimedia Conference","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Virtual Event China","acronym":"MM '21"},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475452","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475452","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:48:33Z","timestamp":1750193313000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475452"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":35,"alternative-id":["10.1145\/3474085.3475452","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475452","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}