{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T12:04:36Z","timestamp":1742990676385,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":26,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789811982996"},{"type":"electronic","value":"9789811983009"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-981-19-8300-9_17","type":"book-chapter","created":{"date-parts":[[2022,12,1]],"date-time":"2022-12-01T14:22:49Z","timestamp":1669904569000},"page":"154-162","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Learning to\u00a0Answer Complex Visual Questions from\u00a0Multi-View Analysis"],"prefix":"10.1007","author":[{"given":"Minjun","family":"Zhu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yixuan","family":"Weng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shizhu","family":"He","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kang","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jun","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,12,2]]},"reference":[{"key":"17_CR1","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: Visual question answering. In: International Conference on Computer Vision, VQA (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"17_CR2","unstructured":"Chen, S.X., Liu, J.S.: Statistical applications of the poisson-binomial and conditional Bernoulli distributions. Statistica Sinica 7, 875\u2013892 (1997)"},{"key":"17_CR3","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, vol. 1 (Long and Short Papers) (Minneapolis, Minnesota, June 2019), Association for Computational Linguistics, pp. 4171\u20134186"},{"issue":"4","key":"17_CR4","doi-asserted-by":"publisher","first-page":"398","DOI":"10.1007\/s11263-018-1116-0","volume":"127","author":"Y Goyal","year":"2018","unstructured":"Goyal, Y., Khot, T., Agrawal, A., Summers-Stay, D., Batra, D., Parikh, D.: Making the V in VQA matter: elevating the role of image understanding in visual question answering. Int. J. Comput. Vis. 127(4), 398\u2013414 (2018). https:\/\/doi.org\/10.1007\/s11263-018-1116-0","journal-title":"Int. J. Comput. Vis."},{"key":"17_CR5","doi-asserted-by":"publisher","first-page":"225","DOI":"10.1016\/j.aiopen.2021.08.002","volume":"2","author":"X Han","year":"2021","unstructured":"Han, X., et al.: Pre-trained models: past, present and future. AI Open 2, 225\u2013250 (2021)","journal-title":"AI Open"},{"key":"17_CR6","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. arXiv:1512.03385 Computer Vision and Pattern Recognition (2015)","DOI":"10.1109\/CVPR.2016.90"},{"key":"17_CR7","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"17_CR8","doi-asserted-by":"crossref","unstructured":"Hu, R., Singh, A.: Unit: multimodal multitask learning with a unified transformer. In: International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00147"},{"key":"17_CR9","unstructured":"Huang, Y., Lv, T., Cui, L., Lu, Y., Wei, F.: Layoutlmv3: pre-training for document AI with unified text and image masking"},{"key":"17_CR10","doi-asserted-by":"crossref","unstructured":"Li, B., Weng, Y., Sun, B., Li, S.: Towards visual-prompt temporal answering grounding in medical instructional video. arXiv preprint arXiv:2203.06667 (2022)","DOI":"10.36227\/techrxiv.22182736.v1"},{"key":"17_CR11","doi-asserted-by":"crossref","unstructured":"Li, W., et al.: UNIMO: towards unified-modal understanding and generation via cross-modal contrastive learning. In: Meeting of the Association for Computational Linguistics (2020)","DOI":"10.18653\/v1\/2021.acl-long.202"},{"key":"17_CR12","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: International Conference on Learning Representations (2018)"},{"key":"17_CR13","unstructured":"Malinowski, M., Fritz, M.: A multi-world approach to question answering about real-world scenes based on uncertain input. In: Neural Information Processing Systems (2014)"},{"key":"17_CR14","unstructured":"Paszke, A., et al.: PyTorch: an imperative style, high-performance deep learning library. In: Wallach, H., Larochelle, H., Beygelzimer, A., d\u2019 Alch\u00e9-Buc, F., Fox, E., Garnett, R. (eds.) Advances in Neural Information Processing Systems, vol. 32, Curran Associates Inc. (2019)"},{"key":"17_CR15","doi-asserted-by":"crossref","unstructured":"Peters, M.E., et al.: Deep contextualized word representations. In: North American Chapter of the Association for Computational Linguistics (2018)","DOI":"10.18653\/v1\/N18-1202"},{"key":"17_CR16","unstructured":"Qi, D., Su, L., Song, J., Cui, E., Bharti, T., Sacheti, A.: ImageBERT: cross-modal pre-training with large-scale weak-supervised image-text data"},{"issue":"10","key":"17_CR17","doi-asserted-by":"publisher","first-page":"1872","DOI":"10.1007\/s11431-020-1647-3","volume":"63","author":"XP Qiu","year":"2020","unstructured":"Qiu, X.P., Sun, T.X., Xu, Y.G., Shao, Y.F., Dai, N., Huang, X.J.: Pre-trained models for natural language processing: a survey. Sci. China Technol. Sci. 63(10), 1872\u20131897 (2020). https:\/\/doi.org\/10.1007\/s11431-020-1647-3","journal-title":"Sci. China Technol. Sci."},{"key":"17_CR18","unstructured":"Radford, A., Narasimhan, K.: Improving language understanding by generative pre-training"},{"key":"17_CR19","unstructured":"Ren, M., Kiros, R., Zemel, R.S.: Exploring models and data for image question answering. In: Neural Information Processing Systems (2015)"},{"key":"17_CR20","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava, N., Hinton, G.E., Krizhevsky, A., Sutskever, I., Salakhutdinov, R.: Dropout: a simple way to prevent neural networks from overfitting. J. Mach. Learn. Res. 15, 1929\u20131958 (2014)","journal-title":"J. Mach. Learn. Res."},{"key":"17_CR21","unstructured":"Wang, P., et al.: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework"},{"key":"17_CR22","unstructured":"Wang, W., Bao, H., Dong, L., Wei, F.: VLMo: unified vision-language pre-training with mixture-of-modality-experts. arXiv: 2111.02358 Computer Vision and Pattern Recognition (2021)"},{"key":"17_CR23","unstructured":"Wolf, T., et al.: Transformers: state-of-the-art natural language processing. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations Association for Computational Linguistics, pp. 38\u201345 (2020)"},{"key":"17_CR24","doi-asserted-by":"crossref","unstructured":"Xu, R., et al.: Raise a child in large language model: towards effective and generalizable fine-tuning. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 9514\u20139528 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.749"},{"key":"17_CR25","doi-asserted-by":"crossref","unstructured":"Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: LayoutLM: pre-training of text and layout for document image understanding. knowledge discovery and data mining (2019)","DOI":"10.1145\/3394486.3403172"},{"key":"17_CR26","doi-asserted-by":"crossref","unstructured":"Xu, Y., et al.: LayoutLMv2: multi-modal pre-training for visually-rich document understanding. In: Meeting of the Association for Computational Linguistics (2020)","DOI":"10.18653\/v1\/2021.acl-long.201"}],"container-title":["Communications in Computer and Information Science","CCKS 2022 - Evaluation Track"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-19-8300-9_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,3,15]],"date-time":"2023-03-15T01:24:22Z","timestamp":1678843462000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-19-8300-9_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9789811982996","9789811983009"],"references-count":26,"URL":"https:\/\/doi.org\/10.1007\/978-981-19-8300-9_17","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"type":"print","value":"1865-0929"},{"type":"electronic","value":"1865-0937"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"2 December 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"CCKS","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China Conference on Knowledge Graph and Semantic Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Qinhuangdao","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24 August 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 August 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccks2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/sigkg.cn\/ccks2022\/en\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Easychair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"42","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"25","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"60% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4.5","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Evaluation Track: 25 papers accepted","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}