{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T15:28:30Z","timestamp":1743002910785,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":31,"publisher":"Springer Singapore","isbn-type":[{"type":"print","value":"9789811905223"},{"type":"electronic","value":"9789811905230"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-981-19-0523-0_7","type":"book-chapter","created":{"date-parts":[[2022,2,28]],"date-time":"2022-02-28T06:02:45Z","timestamp":1646028165000},"page":"103-119","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Multi-modal Universal Embedding Representations for Language Understanding"],"prefix":"10.1007","author":[{"given":"Xi","family":"Luo","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chunjie","family":"Cao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Longjuan","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,3,1]]},"reference":[{"key":"7_CR1","doi-asserted-by":"publisher","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016, Las Vegas, NV, USA, 27\u201330 June 2016, pp. 770\u2013778. IEEE Computer Society (2016). https:\/\/doi.org\/10.1109\/CVPR.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"key":"7_CR2","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: Bengio, Y., LeCun, Y. (eds.) 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, 7\u20139 May 2015, Conference Track Proceedings (2015). http:\/\/arxiv.org\/abs\/1409.1556"},{"key":"7_CR3","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Burstein, J., Doran, C., Solorio, T. (eds.) Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2019, Minneapolis, MN, USA, June 2\u20137, 2019, Volume 1 (Long and Short Papers), pp. 4171\u20134186. Association for Computational Linguistics (2019). https:\/\/doi.org\/10.18653\/v1\/n19-1423","DOI":"10.18653\/v1\/n19-1423"},{"key":"7_CR4","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I.: Improving language understanding by generative pre-training (2018)"},{"key":"7_CR5","unstructured":"Liu, Y., et al.: Roberta: a robustly optimized BERT pretraining approach. CoRR abs\/1907.11692 (2019). http:\/\/arxiv.org\/abs\/1907.11692"},{"key":"7_CR6","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Wallach, H.M., Larochelle, H., Beygelzimer, A., d\u2019Alch\u00e9-Buc, F., Fox, E.B., Garnett, R. (eds.) Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, Vancouver, BC, Canada, 8\u201314 December 2019, pp. 13\u201323 (2019). https:\/\/proceedings.neurips.cc\/paper\/2019\/hash\/c74d97b01eae257e44aa9d5bade97baf-Abstract.html"},{"key":"7_CR7","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1007\/978-3-030-58577-8_7","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y-C Chen","year":"2020","unstructured":"Chen, Y.-C., et al.: UNITER: UNiversal image-TExt representation learning. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020, Part XXX. LNCS, vol. 12375, pp. 104\u2013120. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_7"},{"key":"7_CR8","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"649","DOI":"10.1007\/978-3-319-46487-9_40","volume-title":"Computer Vision \u2013 ECCV 2016","author":"R Zhang","year":"2016","unstructured":"Zhang, R., Isola, P., Efros, A.A.: Colorful image colorization. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016, Part III. LNCS, vol. 9907, pp. 649\u2013666. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46487-9_40"},{"key":"7_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46466-4_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"M Noroozi","year":"2016","unstructured":"Noroozi, M., Favaro, P.: Unsupervised learning of visual representations by solving Jigsaw puzzles. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016, Part VI. LNCS, vol. 9910, pp. 69\u201384. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46466-4_5"},{"key":"7_CR10","doi-asserted-by":"publisher","unstructured":"Pathak, D., Kr\u00e4henb\u00fchl, P., Donahue, J., Darrell, T., Efros, A.A.: Context encoders: feature learning by inpainting. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016, Las Vegas, NV, USA, 27\u201330 June 2016, pp. 2536\u20132544. IEEE Computer Society (2016). https:\/\/doi.org\/10.1109\/CVPR.2016.278","DOI":"10.1109\/CVPR.2016.278"},{"key":"7_CR11","unstructured":"Gidaris, S., Singh, P., Komodakis, N.: Unsupervised representation learning by predicting image rotations. In: 6th International Conference on Learning Representations, ICLR 2018, Vancouver, BC, Canada, April 30\u2013May 3 2018, Conference Track Proceedings. OpenReview.net (2018). https:\/\/openreview.net\/forum?id=S1v4N2l0"},{"key":"7_CR12","doi-asserted-by":"publisher","unstructured":"Doersch, C., Gupta, A., Efros, A.A.: Unsupervised visual representation learning by context prediction. In: 2015 IEEE International Conference on Computer Vision, ICCV 2015, Santiago, Chile, 7\u201313 December 2015, pp. 1422\u20131430. IEEE Computer Society (2015). https:\/\/doi.org\/10.1109\/ICCV.2015.167","DOI":"10.1109\/ICCV.2015.167"},{"key":"7_CR13","doi-asserted-by":"publisher","unstructured":"Peters, M.E., et al.: Deep contextualized word representations. In: Walker, M.A., Ji, H., Stent, A. (eds.) Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2018, New Orleans, Louisiana, USA, 1\u20136 June 2018, Volume 1 (Long Papers), pp. 2227\u20132237. Association for Computational Linguistics (2018). https:\/\/doi.org\/10.18653\/v1\/n18-1202","DOI":"10.18653\/v1\/n18-1202"},{"issue":"8","key":"7_CR14","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., et al.: Language models are unsupervised multitask learners. OpenAI blog 1(8), 9 (2019)","journal-title":"OpenAI blog"},{"key":"7_CR15","unstructured":"Yang, Z., Dai, Z., Yang, Y., Carbonell, J.G., Salakhutdinov, R., Le, Q.V.: XLNet: generalized autoregressive pretraining for language understanding. In: Wallach, H.M., Larochelle, H., Beygelzimer, A., d\u2019Alch\u00e9-Buc, F., Fox, E.B., Garnett, R. (eds.) Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, Vancouver, BC, Canada, 8\u201314 December 2019, pp. 5754\u20135764 (2019). https:\/\/proceedings.neurips.cc\/paper\/2019\/hash\/dc6a7e655d7e5840e66733e9ee67cc69-Abstract.html"},{"key":"7_CR16","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Guyon, I., et al. (eds.) Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, Long Beach, CA, USA, 4\u20139 December 2017, pp. 5998\u20136008 (2017). https:\/\/proceedings.neurips.cc\/paper\/2017\/hash\/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html"},{"key":"7_CR17","doi-asserted-by":"publisher","unstructured":"Chen, X., Cheng, Y., Wang, S., Gan, Z., Wang, Z., Liu, J.: EarlyBERT: efficient BERT training via early-bird lottery tickets. In: Zong, C., Xia, F., Li, W., Navigli, R. (eds.) Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing, ACL\/IJCNLP 2021, (Volume 1: Long Papers), Virtual Event, 1\u20136 August 2021, pp. 2195\u20132207. Association for Computational Linguistics (2021). https:\/\/doi.org\/10.18653\/v1\/2021.acl-long.171","DOI":"10.18653\/v1\/2021.acl-long.171"},{"key":"7_CR18","unstructured":"Wang, B., et al.: InfoBERT: improving robustness of language models from an information theoretic perspective. In: 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, 3\u20137 May 2021. OpenReview.net (2021). https:\/\/openreview.net\/forum?id=hpH98mK5Puk"},{"key":"7_CR19","doi-asserted-by":"publisher","unstructured":"Sun, Z., et al.: ChineseBERT: Chinese pretraining enhanced by glyph and pinyin information. In: Zong, C., Xia, F., Li, W., Navigli, R. (eds.) Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing, ACL\/IJCNLP 2021, (Volume 1: Long Papers), Virtual Event, 1\u20136 August 2021, pp. 2065\u20132075. Association for Computational Linguistics (2021). https:\/\/doi.org\/10.18653\/v1\/2021.acl-long.161","DOI":"10.18653\/v1\/2021.acl-long.161"},{"key":"7_CR20","unstructured":"Sun, L., Wang, J., Zhang, K., Su, Y., Weng, F.: RPBERT: a text-image relation propagation-based BERT model for multimodal NER. In: Thirty-Fifth AAAI Conference on Artificial Intelligence, AAAI 2021, Thirty-Third Conference on Innovative Applications of Artificial Intelligence, IAAI 2021, The Eleventh Symposium on Educational Advances in Artificial Intelligence, EAAI 2021, Virtual Event, 2\u20139 February 2021, pp. 13860\u201313868. AAAI Press (2021). https:\/\/ojs.aaai.org\/index.php\/AAAI\/article\/view\/17633"},{"key":"7_CR21","doi-asserted-by":"publisher","unstructured":"Sun, C., Myers, A., Vondrick, C., Murphy, K., Schmid, C.: VideoBERT: a joint model for video and language representation learning. In: 2019 IEEE\/CVF International Conference on Computer Vision, ICCV 2019, Seoul, Korea (South), October 27\u2013November 2 2019, pp. 7463\u20137472. IEEE (2019). https:\/\/doi.org\/10.1109\/ICCV.2019.00756","DOI":"10.1109\/ICCV.2019.00756"},{"key":"7_CR22","doi-asserted-by":"publisher","unstructured":"Tan, H., Bansal, M.: LXMERT: learning cross-modality encoder representations from transformers. In: Inui, K., Jiang, J., Ng, V., Wan, X. (eds.) Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, EMNLP-IJCNLP 2019, Hong Kong, China, 3\u20137 November 2019, pp. 5099\u20135110. Association for Computational Linguistics (2019). https:\/\/doi.org\/10.18653\/v1\/D19-1514","DOI":"10.18653\/v1\/D19-1514"},{"key":"7_CR23","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C., Chang, K.: VisualBERT: a simple and performant baseline for vision and language. CoRR abs\/1908.03557 (2019). http:\/\/arxiv.org\/abs\/1908.03557"},{"key":"7_CR24","doi-asserted-by":"crossref","unstructured":"Li, G., Duan, N., Fang, Y., Gong, M., Jiang, D.: Unicoder-VL: a universal encoder for vision and language by cross-modal pre-training. In: The Thirty-Fourth AAAI Conference on Artificial Intelligence, AAAI 2020, The Thirty-Second Innovative Applications of Artificial Intelligence Conference, IAAI 2020, The Tenth AAAI Symposium on Educational Advances in Artificial Intelligence, EAAI 2020, New York, NY, USA, 7\u201312 February 2020, pp. 11336\u201311344. AAAI Press (2020). https:\/\/aaai.org\/ojs\/index.php\/AAAI\/article\/view\/6795","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"7_CR25","unstructured":"Su, W., et al.: VL-BERT: pre-training of generic visual-linguistic representations. In: 8th International Conference on Learning Representations, ICLR 2020, Addis Ababa, Ethiopia, 26\u201330 April 2020. OpenReview.net (2020). https:\/\/openreview.net\/forum?id=SygXPaEYvH"},{"key":"7_CR26","doi-asserted-by":"publisher","unstructured":"Alberti, C., Ling, J., Collins, M., Reitter, D.: Fusion of detected objects in text for visual question answering. In: Inui, K., Jiang, J., Ng, V., Wan, X. (eds.) Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, EMNLP-IJCNLP 2019, Hong Kong, China, 3\u20137 November 2019, pp. 2131\u20132140. Association for Computational Linguistics (2019). https:\/\/doi.org\/10.18653\/v1\/D19-1219","DOI":"10.18653\/v1\/D19-1219"},{"key":"7_CR27","doi-asserted-by":"publisher","unstructured":"Antol, S., et al.: VQA: visual question answering. In: 2015 IEEE International Conference on Computer Vision, ICCV 2015, Santiago, Chile, 7\u201313 December 2015, pp. 2425\u20132433. IEEE Computer Society (2015). https:\/\/doi.org\/10.1109\/ICCV.2015.279","DOI":"10.1109\/ICCV.2015.279"},{"key":"7_CR28","doi-asserted-by":"publisher","unstructured":"Hudson, D.A., Manning, C.D.: GQA: a new dataset for real-world visual reasoning and compositional question answering. In: IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2019, Long Beach, CA, USA, 16\u201320 June 2019, pp. 6700\u20136709. Computer Vision Foundation\/IEEE (2019). https:\/\/doi.org\/10.1109\/CVPR.2019.00686. http:\/\/openaccess.thecvf.com\/content_CVPR_2019\/html\/Hudson_GQA_A_New_Dataset_for_Real-World_Visual_Reasoning_and_Compositional_CVPR_2019_paper.html","DOI":"10.1109\/CVPR.2019.00686"},{"key":"7_CR29","doi-asserted-by":"publisher","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: 2018 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2018, Salt Lake City, UT, USA, 18\u201322 June 2018, pp. 6077\u20136086. IEEE Computer Society (2018). https:\/\/doi.org\/10.1109\/CVPR.2018.00636. http:\/\/openaccess.thecvf.com\/content_cvpr_2018\/html\/Anderson_Bottom-Up_and_Top-Down_CVPR_2018_paper.html","DOI":"10.1109\/CVPR.2018.00636"},{"key":"7_CR30","unstructured":"Ba, L.J., Kiros, J.R., Hinton, G.E.: Layer normalization. CoRR abs\/1607.06450 (2016). http:\/\/arxiv.org\/abs\/1607.06450"},{"key":"7_CR31","unstructured":"Wu, Y., et al.: Google\u2019s neural machine translation system: bridging the gap between human and machine translation. CoRR abs\/1609.08144 (2016). http:\/\/arxiv.org\/abs\/1609.08144"}],"container-title":["Communications in Computer and Information Science","Frontiers in Cyber Security"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-19-0523-0_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,2,28]],"date-time":"2022-02-28T06:04:49Z","timestamp":1646028289000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-19-0523-0_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9789811905223","9789811905230"],"references-count":31,"URL":"https:\/\/doi.org\/10.1007\/978-981-19-0523-0_7","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"type":"print","value":"1865-0929"},{"type":"electronic","value":"1865-0937"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"1 March 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"FCS","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Frontiers in Cyber Security","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Haikou","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17 December 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"19 December 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"fcs2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.icfcs.com.cn\/fcs2021","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"58","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"20","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"34% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.7","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4.3","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}