{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T21:24:22Z","timestamp":1777497862543,"version":"3.51.4"},"publisher-location":"Singapore","reference-count":127,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819777068","type":"print"},{"value":"9789819777075","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-981-97-7707-5_29","type":"book-chapter","created":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T15:04:02Z","timestamp":1726499042000},"page":"331-363","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["The Journey of\u00a0Language Models in\u00a0Understanding Natural Language"],"prefix":"10.1007","author":[{"given":"Yuanrui","family":"Liu","sequence":"first","affiliation":[]},{"given":"Jingping","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Guobiao","family":"Sang","sequence":"additional","affiliation":[]},{"given":"Ruilong","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Xinzhe","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Jintao","family":"Fang","sequence":"additional","affiliation":[]},{"given":"Tiexin","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Bohan","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,11]]},"reference":[{"key":"29_CR1","doi-asserted-by":"publisher","first-page":"225","DOI":"10.1016\/j.aiopen.2021.08.002","volume":"2","author":"X Han","year":"2021","unstructured":"Han, X., et al.: Pre-trained models: Past, present and future. AI Open 2, 225\u2013250 (2021)","journal-title":"AI Open"},{"issue":"3","key":"29_CR2","doi-asserted-by":"publisher","first-page":"22","DOI":"10.3390\/bdcc2030022","volume":"2","author":"J Ray","year":"2018","unstructured":"Ray, J., Johnny, O., Trovati, M., Sotiriadis, S., Bessis, N.: The rise of big data science: a survey of techniques, methods and approaches in the field of natural language processing and network theory. Big Data Cogn. Comput. 2(3), 22 (2018)","journal-title":"Big Data Cogn. Comput."},{"key":"29_CR3","doi-asserted-by":"publisher","first-page":"6705","DOI":"10.1007\/s00500-018-3181-2","volume":"22","author":"J Liu","year":"2018","unstructured":"Liu, J., Lin, L., Ren, H., Gu, M., Wang, J., Youn, G., Kim, J.-U.: Building neural network language model with POS-based negative sampling and stochastic conjugate gradient descent. Soft. Comput. 22, 6705\u20136717 (2018)","journal-title":"Soft. Comput."},{"key":"29_CR4","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"29_CR5","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"issue":"1","key":"29_CR6","first-page":"5485","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(1), 5485\u20135551 (2020)","journal-title":"J. Mach. Learn. Res."},{"key":"29_CR7","doi-asserted-by":"crossref","unstructured":"Bendersky, M., Croft, W.B.: Modeling higher-order term dependencies in information retrieval using query hypergraphs. In: Proceedings of the 35th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 941\u2013950 (2012)","DOI":"10.1145\/2348283.2348408"},{"key":"29_CR8","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-14771-6","volume-title":"Syntactic n-Grams in Computational Linguistics","author":"G Sidorov","year":"2019","unstructured":"Sidorov, G.: Syntactic n-Grams in Computational Linguistics. Springer, Heidelberg (2019). https:\/\/doi.org\/10.1007\/978-3-030-14771-6"},{"key":"29_CR9","doi-asserted-by":"crossref","unstructured":"Mikolov, T., Karafi\u00e1t, M., Burget, L., Cernock\u1ef3, J., Khudanpur, S.: Recurrent neural network based language model. In: Interspeech, vol. 2, pp. 1045\u20131048. Makuhari (2010)","DOI":"10.21437\/Interspeech.2010-343"},{"key":"29_CR10","unstructured":"Waswani, A., et al.: Attention is all you need. In: NIPS (2017)"},{"key":"29_CR11","unstructured":"Devlin, J., Kenton, M.-W.C., Toutanova, L.K.: Bert: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of naacL-HLT, vol. 1, p. 2 (2019)"},{"key":"29_CR12","unstructured":"Roberts, A., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer (2019)"},{"key":"29_CR13","unstructured":"Brown, T., et al.: Language models are few-shot learners. In: Advances in Neural Information Processing Systems, vol. 33, pp. 1877\u20131901 (2020)"},{"key":"29_CR14","doi-asserted-by":"crossref","unstructured":"Granados, A.: Analysis and study on text representation to improve the accuracy of the normalized compression distance. arXiv preprint arXiv:1205.6376 (2012)","DOI":"10.3233\/AIC-2012-0529"},{"issue":"8","key":"29_CR15","doi-asserted-by":"publisher","first-page":"817","DOI":"10.1002\/asi.24167","volume":"70","author":"\u00cdC Dourado","year":"2019","unstructured":"Dourado, \u00cd.C., Galante, R., Gon\u00e7alves, M.A., da Silva Torres, R.: Bag of textual graphs (BoTG): a general graph-based text representation model. J. Assoc. Inf. Sci. Technol. 70(8), 817\u2013829 (2019)","journal-title":"J. Assoc. Inf. Sci. Technol."},{"issue":"6","key":"29_CR16","doi-asserted-by":"publisher","first-page":"486","DOI":"10.1177\/004051750207200604","volume":"72","author":"J-J Lin","year":"2002","unstructured":"Lin, J.-J.: Applying a co-occurrence matrix to automatic inspection of weaving density for woven fabrics. Text. Res. J. 72(6), 486\u2013490 (2002)","journal-title":"Text. Res. J."},{"issue":"1","key":"29_CR17","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1016\/0304-3800(95)00142-5","volume":"90","author":"S Lek","year":"1996","unstructured":"Lek, S., Delacoste, M., Baran, P., Dimopoulos, I., Lauga, J., Aulagnier, S.: Application of neural networks to modelling nonlinear relationships in ecology. Ecol. Model. 90(1), 39\u201352 (1996)","journal-title":"Ecol. Model."},{"key":"29_CR18","unstructured":"Ding, Z., Qiu, X., Zhang, Q., Huang, X.: Learning topical translation model for microblog hashtag suggestion. In: Twenty-Third International Joint Conference on Artificial Intelligence (2013)"},{"key":"29_CR19","unstructured":"Mikolov, T., Sutskever, I., Chen, K., Corrado, G.S., Dean, J.: Distributed representations of words and phrases and their compositionality. In: Advances in Neural Information Processing Systems, vol. 26 (2013)"},{"key":"29_CR20","volume-title":"Neural Networks And Deep Learning","author":"MA Nielsen","year":"2015","unstructured":"Nielsen, M.A.: Neural Networks And Deep Learning, vol. 25. Determination Press, San Francisco (2015)"},{"issue":"11","key":"29_CR21","doi-asserted-by":"publisher","first-page":"613","DOI":"10.1145\/361219.361220","volume":"18","author":"G Salton","year":"1975","unstructured":"Salton, G., Wong, A., Yang, C.-S.: A vector space model for automatic indexing. Commun. ACM 18(11), 613\u2013620 (1975)","journal-title":"Commun. ACM"},{"key":"29_CR22","doi-asserted-by":"crossref","unstructured":"Wong, S.K.M., Ziarko, W., Wong, P.C.N.: Generalized vector spaces model in information retrieval. In: Proceedings of the 8th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 18\u201325 (1985)","DOI":"10.1145\/253495.253506"},{"issue":"1","key":"29_CR23","doi-asserted-by":"publisher","first-page":"37","DOI":"10.1017\/S1351324910000185","volume":"17","author":"PA Chew","year":"2011","unstructured":"Chew, P.A., Bader, B.W., Helmreich, S., Abdelali, A., Verzi, S.J.: An information-theoretic, vector-space-model approach to cross-language information retrieval. Nat. Lang. Eng. 17(1), 37\u201370 (2011)","journal-title":"Nat. Lang. Eng."},{"key":"29_CR24","doi-asserted-by":"crossref","unstructured":"Tsatsaronis, G., Panagiotopoulou, V.: A generalized vector space model for text retrieval based on semantic relatedness. In: 2009 Proceedings of the Student Research Workshop at EACL, pp. 70\u201378 (2009)","DOI":"10.3115\/1609179.1609188"},{"key":"29_CR25","first-page":"251","volume":"2","author":"RF Dong","year":"2019","unstructured":"Dong, R.F., Liu, C.A., Yang, G.T.: TF-IDF based loop closure detection algorithm for SLAM. J. Southeast Univ. 2, 251\u2013258 (2019)","journal-title":"J. Southeast Univ."},{"key":"29_CR26","doi-asserted-by":"publisher","first-page":"761","DOI":"10.1016\/j.procs.2017.08.166","volume":"112","author":"M Hajjem","year":"2017","unstructured":"Hajjem, M., Latiri, C.: Combining IR and LDA topic modeling for filtering microblogs. Procedia Comput. Sci. 112, 761\u2013770 (2017)","journal-title":"Procedia Comput. Sci."},{"key":"29_CR27","unstructured":"Liu, Z., Huang, W., Zheng, Y., Sun, M.: Automatic keyphrase extraction via topic decomposition. In: Proceedings of the 2010 conference on empirical methods in natural language processing, pp. 366\u2013376 (2010)"},{"key":"29_CR28","unstructured":"Li, Y., Liu, T., Jiang, J., Zhang, L.: Hashtag recommendation with topical attention-based LSTM. In: Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers, pp. 3019\u20133029 (2016)"},{"issue":"Jan","key":"29_CR29","first-page":"993","volume":"3","author":"DM Blei","year":"2003","unstructured":"Blei, D.M., Ng, A.Y., Jordan, M.I.: Latent Dirichlet allocation. J. Mach. Learn. Res. 3(Jan), 993\u20131022 (2003)","journal-title":"J. Mach. Learn. Res."},{"key":"29_CR30","doi-asserted-by":"crossref","unstructured":"Pu, X., Jin, R., Wu, G., Han, D., Xue, G.R.: Topic modeling in semantic space with keywords. In: Proceedings of the 24th ACM International on Conference on Information and Knowledge Management, pp. 1141\u20131150 (2015)","DOI":"10.1145\/2806416.2806584"},{"issue":"1","key":"29_CR31","doi-asserted-by":"publisher","first-page":"210","DOI":"10.1016\/j.csl.2013.05.002","volume":"28","author":"M Siu","year":"2014","unstructured":"Siu, M., Gish, H., Chan, A., Belfield, W., Lowe, S.: Unsupervised training of an hmm-based self-organizing unit recognizer with applications to topic classification and keyword discovery. Comput. Speech Lang. 28(1), 210\u2013223 (2014)","journal-title":"Comput. Speech Lang."},{"key":"29_CR32","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"935","DOI":"10.1007\/978-3-540-44871-6_108","volume-title":"Pattern Recognition and Image Analysis","author":"A Schenker","year":"2003","unstructured":"Schenker, A., Last, M., Bunke, H., Kandel, A.: Graph representations for web document clustering. In: Perales, F.J., Campilho, A.J.C., de la Blanca, N.P., Sanfeliu, A. (eds.) IbPRIA 2003. LNCS, vol. 2652, pp. 935\u2013942. Springer, Heidelberg (2003). https:\/\/doi.org\/10.1007\/978-3-540-44871-6_108"},{"key":"29_CR33","doi-asserted-by":"crossref","unstructured":"Sonawane, S.S., Kulkarni, P.A.: Graph based representation and analysis of text document: a survey of techniques. Int. J. Comput. Appl. 96(19) (2014)","DOI":"10.5120\/16899-6972"},{"key":"29_CR34","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"368","DOI":"10.1007\/978-3-030-24274-9_33","volume-title":"Artificial Intelligence and Security","author":"Y Chen","year":"2019","unstructured":"Chen, Y., Lu, H., Qiu, J., Wang, L.: A tutorial of graph representation. In: Sun, X., Pan, Z., Bertino, E. (eds.) ICAIS 2019, Part I. LNCS, vol. 11632, pp. 368\u2013378. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-24274-9_33"},{"key":"29_CR35","unstructured":"Allen, J.: Natural Language Understanding. Benjamin-Cummings Publishing Co., Inc. (1995)"},{"key":"29_CR36","unstructured":"Jing, K., Xu, J.: A survey on neural network language models. arXiv preprint arXiv:1906.03591 (2019)"},{"key":"29_CR37","doi-asserted-by":"crossref","unstructured":"Liu, H., Zhang, Y., Wang, Y., Lin, Z., Chen, Y.: Joint character-level word embedding and adversarial stability training to defend adversarial text. In: Proceedings of the AAAI Conference on Artificial Intelligence , vol. 34, pp. 8384\u20138391 (2020)","DOI":"10.1609\/aaai.v34i05.6356"},{"key":"29_CR38","doi-asserted-by":"crossref","unstructured":"Bengtson, E., Roth, D.: Understanding the value of features for coreference resolution. In: Proceedings of the 2008 Conference on Empirical Methods in Natural Language Processing, pp. 294\u2013303 (2008)","DOI":"10.3115\/1613715.1613756"},{"key":"29_CR39","unstructured":"Hinton, G.E., et\u00a0al.: Learning distributed representations of concepts. In: Proceedings of the Eighth Annual Conference of the Cognitive Science Society, vol. 1, p. 12. Amherst, MA (1986)"},{"key":"29_CR40","unstructured":"Levy, O., Goldberg, Y.: Neural word embedding as implicit matrix factorization. In: Advances in Neural Information Processing Systems, vol. 27 (2014)"},{"key":"29_CR41","unstructured":"Bengio, Y., Ducharme, R., Vincent, P.: A neural probabilistic language model. Advances in Neural Information Processing Systems, vol. 13 (2000)"},{"key":"29_CR42","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1016\/j.imavis.2018.04.004","volume":"75","author":"P Rodr\u00edguez","year":"2018","unstructured":"Rodr\u00edguez, P., Bautista, M.A., Gonzalez, J., Escalera, S.: Beyond one-hot encoding: lower dimensional target embedding. Image Vis. Comput. 75, 21\u201331 (2018)","journal-title":"Image Vis. Comput."},{"issue":"4","key":"29_CR43","doi-asserted-by":"publisher","first-page":"713","DOI":"10.1109\/TNN.2007.912312","volume":"19","author":"Y Bengio","year":"2008","unstructured":"Bengio, Y., Sen\u00e9cal, J.-S.: Adaptive importance sampling to accelerate training of a neural probabilistic language model. IEEE Trans. Neural Netw. 19(4), 713\u2013722 (2008)","journal-title":"IEEE Trans. Neural Netw."},{"key":"29_CR44","doi-asserted-by":"crossref","unstructured":"Schwenk, H., Gauvain, J.-L.: Training neural network language models on very large corpora. In Proceedings of Human Language Technology Conference and Conference on Empirical Methods in Natural Language Processing, pp. 201\u2013208 (2005)","DOI":"10.3115\/1220575.1220601"},{"issue":"3","key":"29_CR45","doi-asserted-by":"publisher","first-page":"492","DOI":"10.1016\/j.csl.2006.09.003","volume":"21","author":"H Schwenk","year":"2007","unstructured":"Schwenk, H.: Continuous space language models. Comput. Speech Lang. 21(3), 492\u2013518 (2007)","journal-title":"Comput. Speech Lang."},{"key":"29_CR46","unstructured":"Arisoy, E., Sainath, T.N., Kingsbury, B., Ramabhadran, B.: Deep neural network language models. In: Proceedings of the NAACL-HLT 2012 Workshop: Will We Ever Really Replace the N-gram Model? On the Future of Language Modeling for HLT, pp. 20\u201328 (2012)"},{"key":"29_CR47","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s13278-020-0626-2","volume":"10","author":"N Idrissi","year":"2020","unstructured":"Idrissi, N., Zellou, A.: A systematic literature review of sparsity issues in recommender systems. Soc. Netw. Anal. Min. 10, 1\u201323 (2020)","journal-title":"Soc. Netw. Anal. Min."},{"key":"29_CR48","doi-asserted-by":"crossref","unstructured":"Kombrink, S., Mikolov, T., Karafi\u00e1t, M., Burget, L.: Recurrent neural network based language modeling in meeting recognition. In: Interspeech, vol. 11, pp. 2877\u20132880 (2011)","DOI":"10.21437\/Interspeech.2011-720"},{"key":"29_CR49","doi-asserted-by":"crossref","unstructured":"Mikolov, T., Kombrink, S., Burget, L., \u010cernock\u1ef3, J., Khudanpur, S.: Extensions of recurrent neural network language model. In: 2011 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp. 5528\u20135531. IEEE (2011)","DOI":"10.1109\/ICASSP.2011.5947611"},{"key":"29_CR50","doi-asserted-by":"crossref","unstructured":"Chen, X., Ragni, A., Liu, X., Gales, M.J.F.: Investigating bidirectional recurrent neural network language models for speech recognition. In: Proceedings of Interspeech 2017, pp. 269\u2013273. International Speech Communication Association (ISCA) (2017)","DOI":"10.21437\/Interspeech.2017-513"},{"key":"29_CR51","doi-asserted-by":"crossref","unstructured":"Sundermeyer, M., Schl\u00fcter, R., Ney, H.: LSTM neural networks for language modeling. In: Thirteenth Annual Conference of the International Speech Communication Association (2012)","DOI":"10.21437\/Interspeech.2012-65"},{"key":"29_CR52","unstructured":"Yang, Z., Dai, Z., Salakhutdinov, R., Cohen, W.W.: Breaking the softmax bottleneck: a high-rank RNN language model. arXiv preprint arXiv:1711.03953 (2017)"},{"issue":"392","key":"29_CR53","doi-asserted-by":"publisher","first-page":"863","DOI":"10.1080\/01621459.1985.10478195","volume":"80","author":"JD Kalbfleisch","year":"1985","unstructured":"Kalbfleisch, J.D., Lawless, J.F.: The analysis of panel data under a Markov assumption. J. Am. Stat. Assoc. 80(392), 863\u2013871 (1985)","journal-title":"J. Am. Stat. Assoc."},{"issue":"02","key":"29_CR54","doi-asserted-by":"publisher","first-page":"107","DOI":"10.1142\/S0218488598000094","volume":"6","author":"S Hochreiter","year":"1998","unstructured":"Hochreiter, S.: The vanishing gradient problem during learning recurrent neural nets and problem solutions. Int. J. Uncertain. Fuzziness Knowl.-Based Syst. 6(02), 107\u2013116 (1998)","journal-title":"Int. J. Uncertain. Fuzziness Knowl.-Based Syst."},{"key":"29_CR55","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473 (2014)"},{"key":"29_CR56","doi-asserted-by":"publisher","first-page":"999","DOI":"10.1007\/s00521-017-3065-x","volume":"31","author":"H Deng","year":"2019","unstructured":"Deng, H., Zhang, L., Wang, L.: Global context-dependent recurrent neural network language model with sparse feature learning. Neural Comput. Appl. 31, 999\u20131011 (2019)","journal-title":"Neural Comput. Appl."},{"key":"29_CR57","unstructured":"Edelman, B.L., Goel, S., Kakade, S., Zhang, C.: Inductive biases and variable creation in self-attention mechanisms. In: International Conference on Machine Learning, pp. 5793\u20135831. PMLR (2022)"},{"key":"29_CR58","doi-asserted-by":"crossref","unstructured":"Subakan, C., Ravanelli, M., Cornell, S., Bronzi, M., Zhong, J.: Attention is all you need in speech separation. In: ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 21\u201325. IEEE (2021)","DOI":"10.1109\/ICASSP39728.2021.9413901"},{"key":"29_CR59","unstructured":"Wang, W., et al.: StructBERT: incorporating language structures into pre-training for deep language understanding. arXiv preprint arXiv:1908.04577 (2019)"},{"key":"29_CR60","unstructured":"Liu, Y., et al.: RoBERTa: a robustly optimized BERT pretraining approach. arXiv preprint arXiv:1907.11692 (2019)"},{"key":"29_CR61","unstructured":"Sanh, V., Debut, L., Chaumond, J., Wolf, T.: DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108 (2019)"},{"key":"29_CR62","unstructured":"Lan, Z., Chen, M., Goodman, S., Gimpel, K., Sharma, P., Soricut, R.: ALBERT: a lite BERT for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942 (2019)"},{"key":"29_CR63","doi-asserted-by":"publisher","first-page":"176","DOI":"10.1162\/tacl_a_00360","volume":"9","author":"X Wang","year":"2021","unstructured":"Wang, X., et al.: KEPLER: a unified model for knowledge embedding and pre-trained language representation. Trans. Assoc. Comput. Linguist. 9, 176\u2013194 (2021)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"29_CR64","doi-asserted-by":"crossref","unstructured":"Jiao, X., et al.: TinyBERT: distilling BERT for natural language understanding. arXiv preprint arXiv:1909.10351 (2019)","DOI":"10.18653\/v1\/2020.findings-emnlp.372"},{"key":"29_CR65","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Han, X., Liu, Z., Jiang, X., Sun, M., Liu, Q.: ERNIE: enhanced language representation with informative entities. arXiv preprint arXiv:1905.07129 (2019)","DOI":"10.18653\/v1\/P19-1139"},{"key":"29_CR66","unstructured":"Peters, M.E., et al.: Knowledge enhanced contextual word representations. arXiv preprint arXiv:1909.04164 (2019)"},{"issue":"8","key":"29_CR67","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., et al.: Language models are unsupervised multitask learners. OpenAI blog 1(8), 9 (2019)","journal-title":"OpenAI blog"},{"key":"29_CR68","doi-asserted-by":"crossref","unstructured":"Gao, T., Fisch, A., Chen, D.: Making pre-trained language models better few-shot learners. arXiv preprint arXiv:2012.15723 (2020)","DOI":"10.18653\/v1\/2021.acl-long.295"},{"key":"29_CR69","unstructured":"Keskar, N.S., McCann, B., Varshney, L.R., Xiong, C., Socher, R: CTRL: a conditional transformer language model for controllable generation. arXiv preprint arXiv:1909.05858 (2019)"},{"key":"29_CR70","doi-asserted-by":"crossref","unstructured":"Dai, Z., Yang, Z., Yang, Y., Carbonell, J., Le, Q.V., Salakhutdinov, R.: Transformer-XL: attentive language models beyond a fixed-length context. arXiv preprint arXiv:1901.02860 (2019)","DOI":"10.18653\/v1\/P19-1285"},{"key":"29_CR71","unstructured":"Kitaev, N., Kaiser, \u0141., Levskaya, A.: Reformer: the efficient transformer. arXiv preprint arXiv:2001.04451 (2020)"},{"key":"29_CR72","unstructured":"OpenAI, R.: GPT-4 technical report. arXiv:2303.08774 (2023). View in Article, 2(5)"},{"key":"29_CR73","unstructured":"Ganguli, D., et\u00a0al. Red teaming language models to reduce harms: methods, scaling behaviors, and lessons learned. arXiv preprint arXiv:2209.07858 (2022)"},{"key":"29_CR74","unstructured":"Touvron, H., et\u00a0al. LLaMA: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"29_CR75","unstructured":"Touvron, H., et\u00a0al. LLaMA 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"29_CR76","unstructured":"Costa-juss\u00e0, M.R., et\u00a0al.: No language left behind: Scaling human-centered machine translation. arXiv preprint arXiv:2207.04672 (2022)"},{"key":"29_CR77","unstructured":"Almazrouei, E., et\u00a0al.: Falcon-40B: an open large language model with state-of-the-art performance (2023). https:\/\/falconllmtii.ae. 2022"},{"key":"29_CR78","doi-asserted-by":"crossref","unstructured":"Xue, L., et al.: mT5: a massively multilingual pre-trained text-to-text transformer. arXiv preprint arXiv:2010.11934 (2020)","DOI":"10.18653\/v1\/2021.naacl-main.41"},{"key":"29_CR79","unstructured":"Zeng, W., et\u00a0al.: PanGu-$$\\alpha $$: large-scale autoregressive pretrained chinese language models with auto-parallel computation. arXiv preprint arXiv:2104.12369 (2021)"},{"key":"29_CR80","unstructured":"Sanh, V., et\u00a0al.: Multitask prompted training enables zero-shot task generalization. arXiv preprint arXiv:2110.08207 (2021)"},{"key":"29_CR81","doi-asserted-by":"crossref","unstructured":"Black, S., et\u00a0al.: GPT-NeoX-20B: An open-source autoregressive language model. arXiv preprint arXiv:2204.06745 (2022)","DOI":"10.18653\/v1\/2022.bigscience-1.9"},{"key":"29_CR82","unstructured":"Nijkamp, E., et al.: CodeGen: an open large language model for code with multi-turn program synthesis. arXiv preprint arXiv:2203.13474 (2022)"},{"key":"29_CR83","unstructured":"Tay, Y., et\u00a0al.: UL2: unifying language learning paradigms. arXiv preprint arXiv:2205.05131 (2022)"},{"key":"29_CR84","doi-asserted-by":"crossref","unstructured":"Muennighoff, N., et\u00a0al.: Crosslingual generalization through multitask finetuning. arXiv preprint arXiv:2211.01786 (2022)","DOI":"10.18653\/v1\/2023.acl-long.891"},{"key":"29_CR85","unstructured":"Li, R., et\u00a0al.: StarCoder: may the source be with you! arXiv preprint arXiv:2305.06161 (2023)"},{"key":"29_CR86","doi-asserted-by":"publisher","unstructured":"Huawei Technologies\u00a0Co., Ltd.: Huawei MindSpore AI development framework. In: Huawei Technologies\u00a0Co., Ltd. (eds.) Artificial Intelligence Technology, pp. 137\u2013162. Springer, Singapore (2022). https:\/\/doi.org\/10.1007\/978-981-19-2879-6_5","DOI":"10.1007\/978-981-19-2879-6_5"},{"key":"29_CR87","doi-asserted-by":"crossref","unstructured":"Lewis, M., et al.: BART: denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv:1910.13461 (2019)","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"29_CR88","unstructured":"Zhang, J., Zhao, Y., Saleh, M., Liu, P.: PEGASUS: pre-training with extracted gap-sentences for abstractive summarization. In: International Conference on Machine Learning, pp. 11328\u201311339. PMLR (2020)"},{"key":"29_CR89","unstructured":"Shazeer, N., et al.: Outrageously large neural networks: the sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538 (2017)"},{"key":"29_CR90","doi-asserted-by":"crossref","unstructured":"Du, Z., et al.: GLM: general language model pretraining with autoregressive blank infilling. arXiv preprint arXiv:2103.10360 (2021)","DOI":"10.18653\/v1\/2022.acl-long.26"},{"key":"29_CR91","unstructured":"Zoph, B., et al.: ST-MoE: designing stable and transferable sparse expert models. arXiv preprint arXiv:2202.08906 (2022)"},{"key":"29_CR92","unstructured":"Soltan, S., et\u00a0al.: AlexaTM 20B: few-shot learning using a large-scale multilingual seq2seq model. arXiv preprint arXiv:2208.01448 (2022)"},{"key":"29_CR93","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. arXiv preprint arXiv:2304.08485 (2023)"},{"key":"29_CR94","unstructured":"Griffith, S., Subramanian, K., Scholz, J., Isbell, C.L., Thomaz, A.L.: Policy shaping: integrating human feedback with reinforcement learning. In: Advances in Neural Information Processing Systems, vol. 26 (2013)"},{"key":"29_CR95","doi-asserted-by":"crossref","unstructured":"Gao, T., Fisch, A., Chen, D.: Making pre-trained language models better few-shot learners. In: Zong, C., Xia, F., Li, W., Navigli, R., (eds.) Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 3816\u20133830. Association for Computational Linguistics (2021)","DOI":"10.18653\/v1\/2021.acl-long.295"},{"key":"29_CR96","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"331","DOI":"10.1007\/978-981-99-6222-8_28","volume-title":"Web Information Systems and Applications","author":"Z Li","year":"2023","unstructured":"Li, Z., Song, M., Zhu, Y., Zhang, L.: Chinese nested named entity recognition based on boundary prompt. In: Yuan, L., Yang, S., Li, R., Kanoulas, E., Zhao, X. (eds.) WISA 2023. LNCS, vol. 14094, pp. 331\u2013343. Springer, Singapore (2023). https:\/\/doi.org\/10.1007\/978-981-99-6222-8_28"},{"key":"29_CR97","doi-asserted-by":"crossref","unstructured":"Hao, Y., Mendelsohn, S., Sterneck, R., Martinez, R., Frank, R.: Probabilistic predictions of people perusing: evaluating metrics of language model performance for psycholinguistic modeling. arXiv preprint arXiv:2009.03954 (2020)","DOI":"10.18653\/v1\/2020.cmcl-1.10"},{"key":"29_CR98","unstructured":"Mnih, A., Teh, Y.W.: A fast and simple algorithm for training neural probabilistic language models. arXiv preprint arXiv:1206.6426 (2012)"},{"key":"29_CR99","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.-J.: BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"29_CR100","doi-asserted-by":"crossref","unstructured":"Wang, A., Singh, A., Michael, J., Hill, F., Levy, O., Bowman, S.R.: GLUE: a multi-task benchmark and analysis platform for natural language understanding. arXiv preprint arXiv:1804.07461 (2018)","DOI":"10.18653\/v1\/W18-5446"},{"key":"29_CR101","unstructured":"Wang, A., et al.: SuperGLUE: a stickier benchmark for general-purpose language understanding systems. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"29_CR102","unstructured":"Tenney, I., et\u00a0al.: What do you learn from context? Probing for sentence structure in contextualized word representations. arXiv preprint arXiv:1905.06316 (2019)"},{"key":"29_CR103","doi-asserted-by":"crossref","unstructured":"Giulianelli, M., Harding, J., Mohnert, F., Hupkes, D., Zuidema, W.: Under the hood: using diagnostic classifiers to investigate and improve how language models track agreement information. arXiv preprint arXiv:1808.08079 (2018)","DOI":"10.18653\/v1\/W18-5426"},{"key":"29_CR104","doi-asserted-by":"crossref","unstructured":"Tenney, I., Das, D., Pavlick, E.: BERT rediscovers the classical NLP pipeline. arXiv preprint arXiv:1905.05950 (2019)","DOI":"10.18653\/v1\/P19-1452"},{"key":"29_CR105","unstructured":"Kim, T., Choi, J., Edmiston, D., Lee, S.: Are pre-trained language models aware of phrases? Simple but strong baselines for grammar induction. arXiv preprint arXiv:2002.00737 (2020)"},{"key":"29_CR106","unstructured":"Hewitt, J., Manning, C.D.: A structural probe for finding syntax in word representations. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 4129\u20134138 (2019)"},{"key":"29_CR107","doi-asserted-by":"publisher","first-page":"842","DOI":"10.1162\/tacl_a_00349","volume":"8","author":"A Rogers","year":"2021","unstructured":"Rogers, A., Kovaleva, O., Rumshisky, A.: A primer in bertology: what we know about how BERT works. Trans. Assoc. Comput. Linguist. 8, 842\u2013866 (2021)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"29_CR108","unstructured":"Saunshi, N., Malladi, S., Arora, S.: A mathematical exploration of why language models help solve downstream tasks. arXiv preprint arXiv:2010.03648 (2020)"},{"key":"29_CR109","unstructured":"Wei, C., Xie, S.M., Ma, T.: Why do pretrained language models help in downstream tasks? an analysis of head and prompt tuning. In: Advances in Neural Information Processing Systems, vol. 34, pp. 16158\u201316170 (2021)"},{"key":"29_CR110","doi-asserted-by":"crossref","unstructured":"Fahad, N.M., Sakib, S., Raiaan, M.A.K., Mukta, M.S.H.: SkinNet-8: an efficient CNN architecture for classifying skin cancer on an imbalanced dataset. In: 2023 International Conference on Electrical, Computer and Communication Engineering (ECCE), pp. 1\u20136. IEEE (2023)","DOI":"10.1109\/ECCE57851.2023.10101527"},{"key":"29_CR111","unstructured":"Zhu, X., Li, J., Liu, Y., Ma, C., Wang, W.: A survey on model compression for large language models. arXiv preprint arXiv:2308.07633 (2023)"},{"issue":"1","key":"29_CR112","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1007\/s11127-023-01097-2","volume":"198","author":"F Motoki","year":"2024","unstructured":"Motoki, F., Neto, V.P., Rodrigues, V.: More human than human: measuring ChatGPT political bias. Public Choice 198(1), 3\u201323 (2024)","journal-title":"Public Choice"},{"key":"29_CR113","doi-asserted-by":"publisher","first-page":"15","DOI":"10.1007\/978-3-030-72188-6_2","volume-title":"Humanity Driven AI","author":"L Zhu","year":"2022","unstructured":"Zhu, L., Xu, X., Lu, Q., Governatori, G., Whittle, J.: AI and\u00a0ethics\u2014operationalizing responsible AI. In: Chen, F., Zhou, J. (eds.) Humanity Driven AI, pp. 15\u201333. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-030-72188-6_2"},{"key":"29_CR114","unstructured":"Meng, Y., Michalski, M., Huang, J., Zhang, Y., Abdelzaher, T., Han, J.: Tuning language models as training data generators for augmentation-enhanced few-shot learning. In: International Conference on Machine Learning, pp. 24457\u201324477. PMLR (2023)"},{"key":"29_CR115","doi-asserted-by":"publisher","first-page":"107540","DOI":"10.1016\/j.chb.2022.107540","volume":"139","author":"I Molenaar","year":"2023","unstructured":"Molenaar, I., de Mooij, S., Azevedo, R., Bannert, M., J\u00e4rvel\u00e4, S., Ga\u0161evi\u0107, D.: Measuring self-regulated learning and the role of AI: five years of research using multimodal multichannel data. Comput. Hum. Behav. 139, 107540 (2023)","journal-title":"Comput. Hum. Behav."},{"key":"29_CR116","doi-asserted-by":"crossref","unstructured":"Azevedo, R., Ga\u0161evi\u0107, D.: Analyzing multimodal multichannel data about self-regulated learning with advanced learning technologies: issues and challenges (2019)","DOI":"10.1016\/j.chb.2019.03.025"},{"key":"29_CR117","doi-asserted-by":"crossref","unstructured":"He, C., et al.: UltraEval: a lightweight platform for flexible and comprehensive evaluation for LLMs. arXiv preprint arXiv:2404.07584 (2024)","DOI":"10.18653\/v1\/2024.acl-demos.23"},{"issue":"2","key":"29_CR118","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3503488","volume":"13","author":"K Werder","year":"2022","unstructured":"Werder, K., Ramesh, B., Zhang, R.: Establishing data provenance for responsible artificial intelligence systems. ACM Trans. Manage. Inf. Syst. (TMIS) 13(2), 1\u201323 (2022)","journal-title":"ACM Trans. Manage. Inf. Syst. (TMIS)"},{"key":"29_CR119","doi-asserted-by":"crossref","unstructured":"Iqbal, U., Kohno, T., Roesner, F.: LLM platform security: applying a systematic evaluation framework to OpenAI\u2019s ChatGPT plugins. arXiv preprint arXiv:2309.10254 (2023)","DOI":"10.1609\/aies.v7i1.31664"},{"key":"29_CR120","doi-asserted-by":"crossref","unstructured":"Jiang, J., Liu, X., Fan, C.: Low-parameter federated learning with large language models. arXiv preprint arXiv:2307.13896 (2023)","DOI":"10.1007\/978-981-97-7707-5_28"},{"key":"29_CR121","doi-asserted-by":"crossref","unstructured":"Sun, S., Cheng, Y., Gan, Z., Liu, J.: Patient knowledge distillation for BERT model compression. arXiv preprint arXiv:1908.09355 (2019)","DOI":"10.18653\/v1\/D19-1441"},{"key":"29_CR122","doi-asserted-by":"crossref","unstructured":"Dietz, L., Xiong, C., Dalton, J., Meij, E.: The second workshop on knowledge graphs and semantics for text retrieval, analysis, and understanding (KG4IR). In: The 41st International ACM SIGIR Conference on Research & Development in Information Retrieval, pp. 1423\u20131426 (2018)","DOI":"10.1145\/3209978.3210196"},{"key":"29_CR123","doi-asserted-by":"crossref","unstructured":"Yang, Y., Huang, C., Xia, L., Li, C.: Knowledge graph contrastive learning for recommendation. In: Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 1434\u20131443 (2022)","DOI":"10.1145\/3477495.3532009"},{"key":"29_CR124","unstructured":"Zhang, Z., Zhang, A., Li, M., Zhao, H., Karypis, G., Smola, A.: Multimodal chain-of-thought reasoning in language models. arXiv preprint arXiv:2302.00923 (2023)"},{"key":"29_CR125","unstructured":"Zheng, G., Yang, B., Tang, J., Zhou, H.-Y., Yang, S.: DDCoT: duty-distinct chain-of-thought prompting for multimodal reasoning in language models. In: Advances in Neural Information Processing Systems, vol. 36, pp. 5168\u20135191 (2023)"},{"key":"29_CR126","unstructured":"Pan, L., et al.: Learn to explain: multimodal reasoning via thought chains for science question answering. In: Advances in Neural Information Processing Systems, vol. 35, pp. 2507\u20132521 (2022)"},{"key":"29_CR127","unstructured":"Liu, Z., Zhang, Y., Li, P., Liu, Y., Yang, D.: Dynamic LLM-agent network: an LLM-agent collaboration framework with agent team optimization. arXiv preprint arXiv:2310.02170 (2023)"}],"container-title":["Lecture Notes in Computer Science","Web Information Systems and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-7707-5_29","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T06:35:47Z","timestamp":1732775747000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-7707-5_29"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9789819777068","9789819777075"],"references-count":127,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-7707-5_29","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"11 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"WISA","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Web Information Systems and Applications","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Yinchuan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 August 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"3 August 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"wisa22024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/conf.ccf.org.cn\/web\/html7\/index.html?globalId=m1216704987858604032171012667439&type=1","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}