{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:23:28Z","timestamp":1750220608329,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":25,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,6,8]],"date-time":"2020-06-08T00:00:00Z","timestamp":1591574400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Beijing Natural Science Foundation","award":["4202033"],"award-info":[{"award-number":["4202033"]}]},{"DOI":"10.13039\/501100012659","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61672523"],"award-info":[{"award-number":["61672523"]}],"id":[{"id":"10.13039\/501100012659","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,6,8]]},"DOI":"10.1145\/3372278.3390697","type":"proceedings-article","created":{"date-parts":[[2020,6,2]],"date-time":"2020-06-02T04:35:27Z","timestamp":1591072527000},"page":"428-435","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["iCap: Interactive Image Captioning with Predictive Text"],"prefix":"10.1145","author":[{"given":"Zhengxiong","family":"Jia","sequence":"first","affiliation":[{"name":"Renmin University of China &amp; Visionary Intelligence Ltd., Beijing, China"}]},{"given":"Xirong","family":"Li","sequence":"additional","affiliation":[{"name":"Renmin University of China &amp; Visionary Intelligence Ltd., Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2020,6,8]]},"reference":[{"doi-asserted-by":"crossref","unstructured":"D. Acuna H. Ling A. Kar and S. Fidler. 2018. Efficient Interactive Annotation of Segmentation Datasets With Polygon-RNN+. In CVPR.  D. Acuna H. Ling A. Kar and S. Fidler. 2018. Efficient Interactive Annotation of Segmentation Datasets With Polygon-RNN+. In CVPR.","key":"e_1_3_2_1_1_1","DOI":"10.1109\/CVPR.2018.00096"},{"doi-asserted-by":"crossref","unstructured":"P. Anderson X. He C. Buehler D. Teney M. Johnson S. Gould and L. Zhang. 2018. Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering. In CVPR.  P. Anderson X. He C. Buehler D. Teney M. Johnson S. Gould and L. Zhang. 2018. Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering. In CVPR.","key":"e_1_3_2_1_2_1","DOI":"10.1109\/CVPR.2018.00636"},{"doi-asserted-by":"crossref","unstructured":"F. Cai and M. Rijke. 2016. A Survey of Query Auto Completion in Information Retrieval. Foundations and Trends in Information Retrieval (2016).  F. Cai and M. Rijke. 2016. A Survey of Query Auto Completion in Information Retrieval. Foundations and Trends in Information Retrieval (2016).","key":"e_1_3_2_1_3_1","DOI":"10.1561\/9781680832013"},{"doi-asserted-by":"crossref","unstructured":"L. Castrej K. Kundu R. Urtasun and S. Fidler. 2017. Annotating Object Instances with a Polygon-RNN. In CVPR.  L. Castrej K. Kundu R. Urtasun and S. Fidler. 2017. Annotating Object Instances with a Polygon-RNN. In CVPR.","key":"e_1_3_2_1_4_1","DOI":"10.1109\/CVPR.2017.477"},{"doi-asserted-by":"crossref","unstructured":"S. Chen T. Yao and Y.-G. Jiang. 2019. Deep Learning for Video Captioning: A Review. In IJCAI.  S. Chen T. Yao and Y.-G. Jiang. 2019. Deep Learning for Video Captioning: A Review. In IJCAI.","key":"e_1_3_2_1_5_1","DOI":"10.24963\/ijcai.2019\/877"},{"unstructured":"X. Chen H. Fang T.-Y. Lin R. Vedantam S. Gupta P. Doll\u00e1r and L. Zitnick. 2015. Microsoft COCO captions: Data collection and evaluation server. CoRR Vol. abs\/1504.00325 (2015).  X. Chen H. Fang T.-Y. Lin R. Vedantam S. Gupta P. Doll\u00e1r and L. Zitnick. 2015. Microsoft COCO captions: Data collection and evaluation server. CoRR Vol. abs\/1504.00325 (2015).","key":"e_1_3_2_1_6_1"},{"doi-asserted-by":"crossref","unstructured":"M. Cornia L. Baraldi and R. Cucchiara. 2019. Show Control and Tell: A Framework for Generating Controllable and Grounded Captions. In CVPR.  M. Cornia L. Baraldi and R. Cucchiara. 2019. Show Control and Tell: A Framework for Generating Controllable and Grounded Captions. In CVPR.","key":"e_1_3_2_1_7_1","DOI":"10.1109\/CVPR.2019.00850"},{"doi-asserted-by":"crossref","unstructured":"H. Ge Z. Yan K. Zhang M. Zhao and L. Sun. 2019. Exploring Overall Contextual Information for Image Captioning in Human-Like Cognitive Style. In ICCV.  H. Ge Z. Yan K. Zhang M. Zhao and L. Sun. 2019. Exploring Overall Contextual Information for Image Captioning in Human-Like Cognitive Style. In ICCV.","key":"e_1_3_2_1_8_1","DOI":"10.1109\/ICCV.2019.00184"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_9_1","DOI":"10.1162\/neco.1997.9.8.1735"},{"doi-asserted-by":"crossref","unstructured":"L. Huang W. Wang J. Chen and X.-Y. Wei. 2019. Attention on Attention for Image Captioning. In ICCV.  L. Huang W. Wang J. Chen and X.-Y. Wei. 2019. Attention on Attention for Image Captioning. In ICCV.","key":"e_1_3_2_1_10_1","DOI":"10.1109\/ICCV.2019.00473"},{"doi-asserted-by":"crossref","unstructured":"W. Lan X. Li and J. Dong. 2017. Fluency-Guided Cross-Lingual Image Captioning. In ACMMM.  W. Lan X. Li and J. Dong. 2017. Fluency-Guided Cross-Lingual Image Captioning. In ACMMM.","key":"e_1_3_2_1_11_1","DOI":"10.1145\/3123266.3123366"},{"key":"e_1_3_2_1_12_1","volume-title":"Insertions and Reversals. Soviet Physics Doklady","volume":"10","author":"Levenshtein V. I.","year":"1966","unstructured":"V. I. Levenshtein . 1966 . Binary Codes Capable of Correcting Deletions , Insertions and Reversals. Soviet Physics Doklady , Vol. 10 (Feb. 1966), 707. V. I. Levenshtein. 1966. Binary Codes Capable of Correcting Deletions, Insertions and Reversals. Soviet Physics Doklady, Vol. 10 (Feb. 1966), 707."},{"doi-asserted-by":"crossref","unstructured":"X. Li L. Gao X. Wang W. Liu X. Xu H.-T. Shen and J. Song. 2019 a. Learnable Aggregating Net with Diversity Learning for Video Question Answering. In ACMMM.  X. Li L. Gao X. Wang W. Liu X. Xu H.-T. Shen and J. Song. 2019 a. Learnable Aggregating Net with Diversity Learning for Video Question Answering. In ACMMM.","key":"e_1_3_2_1_13_1","DOI":"10.1145\/3343031.3350971"},{"doi-asserted-by":"crossref","unstructured":"X. Li W. Lan J. Dong and H. Liu. 2016. Adding Chinese Captions to Images.  X. Li W. Lan J. Dong and H. Liu. 2016. Adding Chinese Captions to Images.","key":"e_1_3_2_1_14_1","DOI":"10.1145\/2911996.2912049"},{"key":"e_1_3_2_1_15_1","volume-title":"2019 b. COCO-CN for Cross-Lingual Image Tagging, Captioning and Retrieval. T-MM","author":"Li X.","year":"2019","unstructured":"X. Li , C. Xu , X. Wang , W. Lan , Z. Jia , G. Yang , and J. Xu . 2019 b. COCO-CN for Cross-Lingual Image Tagging, Captioning and Retrieval. T-MM ( 2019 ). X. Li, C. Xu, X. Wang, W. Lan, Z. Jia, G. Yang, and J. Xu. 2019 b. COCO-CN for Cross-Lingual Image Tagging, Captioning and Retrieval. T-MM (2019)."},{"doi-asserted-by":"crossref","unstructured":"H. Ling J. Gao A. Kar W. Chen and S. Fidler. 2019. Fast Interactive Object Annotation With Curve-GCN. In CVPR.  H. Ling J. Gao A. Kar W. Chen and S. Fidler. 2019. Fast Interactive Object Annotation With Curve-GCN. In CVPR.","key":"e_1_3_2_1_16_1","DOI":"10.1109\/CVPR.2019.00540"},{"doi-asserted-by":"crossref","unstructured":"T. Luong H. Pham and C. D. Manning. 2015. Effective Approaches to Attention-based Neural Machine Translation. In EMNLP.  T. Luong H. Pham and C. D. Manning. 2015. Effective Approaches to Attention-based Neural Machine Translation. In EMNLP.","key":"e_1_3_2_1_17_1","DOI":"10.18653\/v1\/D15-1166"},{"doi-asserted-by":"crossref","unstructured":"P. Mettes D. Koelma and C. Snoek. 2016. The ImageNet Shuffle: Reorganized Pre-training for Video Event Detection. In ICMR.  P. Mettes D. Koelma and C. Snoek. 2016. The ImageNet Shuffle: Reorganized Pre-training for Video Event Detection. In ICMR.","key":"e_1_3_2_1_18_1","DOI":"10.1145\/2911996.2912036"},{"doi-asserted-by":"crossref","unstructured":"S. Rennie E. Marcheret Y. Mroueh J. Ross and V. Goel. 2017. Self-Critical Sequence Training for Image Captioning. In CVPR.  S. Rennie E. Marcheret Y. Mroueh J. Ross and V. Goel. 2017. Self-Critical Sequence Training for Image Captioning. In CVPR.","key":"e_1_3_2_1_19_1","DOI":"10.1109\/CVPR.2017.131"},{"key":"e_1_3_2_1_20_1","volume-title":"Twin Networks: Matching the Future for Sequence Generation. In ICLR.","author":"Serdyuk D.","year":"2018","unstructured":"D. Serdyuk , N. Rosemary Ke , A. Sordoni , A. Trischler , C. Pal , and Y. Bengio . 2018 . Twin Networks: Matching the Future for Sequence Generation. In ICLR. D. Serdyuk, N. Rosemary Ke, A. Sordoni, A. Trischler, C. Pal, and Y. Bengio. 2018. Twin Networks: Matching the Future for Sequence Generation. In ICLR."},{"doi-asserted-by":"crossref","unstructured":"O. Vinyals A. Toshev S. Bengio and D. Erhan. 2015. Show and Tell: A Neural Image Caption Generator. In CVPR.  O. Vinyals A. Toshev S. Bengio and D. Erhan. 2015. Show and Tell: A Neural Image Caption Generator. In CVPR.","key":"e_1_3_2_1_21_1","DOI":"10.1109\/CVPR.2015.7298935"},{"doi-asserted-by":"crossref","unstructured":"C. Wang H. Yang and C. Meinel. 2018. Image Captioning with Deep Bidirectional LSTMs and Multi-Task Learning. TOMM (2018).  C. Wang H. Yang and C. Meinel. 2018. Image Captioning with Deep Bidirectional LSTMs and Multi-Task Learning. TOMM (2018).","key":"e_1_3_2_1_22_1","DOI":"10.1145\/3115432"},{"doi-asserted-by":"crossref","unstructured":"A. Wu Y. Han and Y. Yang. 2019. Video Interactive Captioning with Human Prompts. In IJCAI.  A. Wu Y. Han and Y. Yang. 2019. Video Interactive Captioning with Human Prompts. In IJCAI.","key":"e_1_3_2_1_23_1","DOI":"10.24963\/ijcai.2019\/135"},{"doi-asserted-by":"crossref","unstructured":"J. Xu T. Mei T. Yao and Y. Rui. 2016. MSR-VTT: A Large Video Description Dataset for Bridging Video and Language. In CVPR.  J. Xu T. Mei T. Yao and Y. Rui. 2016. MSR-VTT: A Large Video Description Dataset for Bridging Video and Language. In CVPR.","key":"e_1_3_2_1_24_1","DOI":"10.1109\/CVPR.2016.571"},{"doi-asserted-by":"crossref","unstructured":"X. Zhang J. Su Y. Qin Y. Liu R. Ji and H. Wang. 2018. Asynchronous Bidirectional Decoding for Neural Machine Translation. In AAAI.  X. Zhang J. Su Y. Qin Y. Liu R. Ji and H. Wang. 2018. Asynchronous Bidirectional Decoding for Neural Machine Translation. In AAAI.","key":"e_1_3_2_1_25_1","DOI":"10.1609\/aaai.v32i1.11984"}],"event":{"sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"acronym":"ICMR '20","name":"ICMR '20: International Conference on Multimedia Retrieval","location":"Dublin Ireland"},"container-title":["Proceedings of the 2020 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3372278.3390697","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3372278.3390697","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T21:32:10Z","timestamp":1750195930000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3372278.3390697"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,6,8]]},"references-count":25,"alternative-id":["10.1145\/3372278.3390697","10.1145\/3372278"],"URL":"https:\/\/doi.org\/10.1145\/3372278.3390697","relation":{},"subject":[],"published":{"date-parts":[[2020,6,8]]},"assertion":[{"value":"2020-06-08","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}