{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,21]],"date-time":"2026-03-21T07:14:16Z","timestamp":1774077256204,"version":"3.50.1"},"reference-count":43,"publisher":"Association for Natural Language Processing","issue":"1","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Journal of Natural Language Processing"],"published-print":{"date-parts":[[2026]]},"DOI":"10.5715\/jnlp.33.51","type":"journal-article","created":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T22:13:09Z","timestamp":1773526389000},"page":"51-75","source":"Crossref","is-referenced-by-count":0,"title":["Rethinking the Evaluation Methods of LLMs' Grammatical Knowledge","\u5927\u898f\u6a21\u8a00\u8a9e\u30e2\u30c7\u30eb\u306e\u6587\u6cd5\u77e5\u8b58\u8a55\u4fa1\u65b9\u6cd5\u306e\u518d\u691c\u8a0e"],"prefix":"10.5715","volume":"33","author":[{"given":"Yusuke","family":"Ide","sequence":"first","affiliation":[{"name":"Nara Institute of Science and Technology"}]},{"given":"Yuto","family":"Nishida","sequence":"additional","affiliation":[{"name":"Nara Institute of Science and Technology"}]},{"given":"Justin","family":"Vasselli","sequence":"additional","affiliation":[{"name":"Nara Institute of Science and Technology"}]},{"given":"Miyu","family":"Oba","sequence":"additional","affiliation":[{"name":"Nara Institute of Science and Technology"}]},{"given":"Yusuke","family":"Sakai","sequence":"additional","affiliation":[{"name":"Nara Institute of Science and Technology"}]},{"given":"Hidetaka","family":"Kamigaito","sequence":"additional","affiliation":[{"name":"Nara Institute of Science and Technology"}]},{"given":"Taro","family":"Watanabe","sequence":"additional","affiliation":[{"name":"Nara Institute of Science and Technology"}]}],"member":"3685","reference":[{"key":"1","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J. 
D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., Agarwal, S., Herbert-Voss, A., Krueger, G., Henighan, T., Child, R., Ramesh, A., Ziegler, D., Wu, J., Winter, C., Hesse, C., Chen, M., Sigler, E., Litwin, M., Gray, S., Chess, B., Clark, J., Berner, C., McCandlish, S., Radford, A., Sutskever, I., and Amodei, D. (2020). \u201cLanguage Models Are Few-Shot Learners.\u201d In Larochelle, H., Ranzato, M., Hadsell, R., Balcan, M., and Lin, H. (Eds.), <i>Advances in Neural Information Processing Systems<\/i>, Vol. 33, pp. 1877\u20131901. Curran Associates, Inc."},{"key":"2","doi-asserted-by":"crossref","unstructured":"Haga, A., Sugawara, S., Fukatsu, A., Oba, M., Ouchi, H., Watanabe, T., and Oseki, Y. (2024). \u201cModeling Overregularization in Children with Small Language Models.\u201d In Ku, L.-W., Martins, A., and Srikumar, V. (Eds.), <i>Findings of the Association for Computational Linguistics: ACL 2024<\/i>, pp. 14532\u201314550, Bangkok, Thailand. Association for Computational Linguistics.","DOI":"10.18653\/v1\/2024.findings-acl.865"},{"key":"3","unstructured":"Hendrycks, D., Burns, C., Basart, S., Zou, A., Mazeika, M., Song, D., and Steinhardt, J. (2021). \u201cMeasuring Massive Multitask Language Understanding.\u201d In <i>International Conference on Learning Representations<\/i>."},{"key":"4","doi-asserted-by":"crossref","unstructured":"Hu, J. and Levy, R. (2023). \u201cPrompting Is Not a Substitute for Probability Measurements in Large Language Models.\u201d In Bouamor, H., Pino, J., and Bali, K. (Eds.), <i>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing<\/i>, pp. 5040\u20135060, Singapore. Association for Computational Linguistics.","DOI":"10.18653\/v1\/2023.emnlp-main.306"},{"key":"5","doi-asserted-by":"crossref","unstructured":"Ide, Y., Nishida, Y., Vasselli, J., Oba, M., Sakai, Y., Kamigaito, H., and Watanabe, T. (2025). 
\u201cHow to Make the Most of LLMs\u2019 Grammatical Knowledge for Acceptability Judgments.\u201d In Chiruzzo, L., Ritter, A., and Wang, L. (Eds.), <i>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)<\/i>, pp. 7416\u20137432, Albuquerque, New Mexico. Association for Computational Linguistics.","DOI":"10.18653\/v1\/2025.naacl-long.380"},{"key":"6","doi-asserted-by":"crossref","unstructured":"Ivanova, A. A., Sathe, A., Lipkin, B., Kumar, U., Radkani, S., Clark, T. H., Kauf, C., Hu, J., Pramod, R., Grand, G., et al. (2024). \u201cElements of World Knowledge (EWOK): A Cognition-inspired Framework for Evaluating Basic World Knowledge in Language Models.\u201d <i>arXiv preprint arXiv:2405.09605<\/i>.","DOI":"10.1162\/TACL.a.38"},{"key":"7","doi-asserted-by":"crossref","unstructured":"Iwamoto, R., Yoshida, I., Kanayama, H., Ohko, T., and Muraoka, M. (2023). \u201cIncorporating Syntactic Knowledge into Pre-trained Language Model using Optimization for Overcoming Catastrophic Forgetting.\u201d In Bouamor, H., Pino, J., and Bali, K. (Eds.), <i>Findings of the Association for Computational Linguistics: EMNLP 2023<\/i>, pp. 10981\u201310993, Singapore. Association for Computational Linguistics.","DOI":"10.18653\/v1\/2023.findings-emnlp.732"},{"key":"8","unstructured":"Jiang, A. Q., Sablayrolles, A., Roux, A., Mensch, A., Savary, B., Bamford, C., Chaplot, D. S., de las Casas, D., Hanna, E. B., Bressand, F., Lengyel, G., Bour, G., Lample, G., Lavaud, L. R., Saulnier, L., Lachaux, M.-A., Stock, P., Subramanian, S., Yang, S., Antoniak, S., Scao, T. L., Gervet, T., Lavril, T., Wang, T., Lacroix, T., and Sayed, W. E. (2024). \u201cMixtral of Experts.\u201d <i>arXiv preprint arXiv:2401.04088<\/i>."},{"key":"9","unstructured":"Koehn, P. (2004). \u201cStatistical Significance Tests for Machine Translation Evaluation.\u201d In Lin, D. and Wu, D. 
(Eds.), <i>Proceedings of the 2004 Conference on Empirical Methods in Natural Language Processing<\/i>, pp. 388\u2013395, Barcelona, Spain. Association for Computational Linguistics."},{"key":"10","doi-asserted-by":"crossref","unstructured":"Lau, J. H., Armendariz, C., Lappin, S., Purver, M., and Shu, C. (2020). \u201cHow Furiously Can Colorless Green Ideas Sleep? Sentence Acceptability in Context.\u201d <i>Transactions of the Association for Computational Linguistics<\/i>, 8, pp. 296\u2013310.","DOI":"10.1162\/tacl_a_00315"},{"key":"11","doi-asserted-by":"crossref","unstructured":"Lau, J. H., Clark, A., and Lappin, S. (2017). \u201cGrammaticality, Acceptability, and Probability: A Probabilistic View of Linguistic Knowledge.\u201d <i>Cognitive Science<\/i>, 41 (5), pp. 1202\u20131241.","DOI":"10.1111\/cogs.12414"},{"key":"12","doi-asserted-by":"crossref","unstructured":"Lewis, M., Liu, Y., Goyal, N., Ghazvininejad, M., Mohamed, A., Levy, O., Stoyanov, V., and Zettlemoyer, L. (2020). \u201cBART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension.\u201d In Jurafsky, D., Chai, J., Schluter, N., and Tetreault, J. (Eds.), <i>Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics<\/i>, pp. 7871\u20137880, Online. Association for Computational Linguistics.","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"13","doi-asserted-by":"crossref","unstructured":"Linzen, T., Dupoux, E., and Goldberg, Y. (2016). \u201cAssessing the Ability of LSTMs to Learn Syntax-Sensitive Dependencies.\u201d <i>Transactions of the Association for Computational Linguistics<\/i>, 4, pp. 521\u2013535.","DOI":"10.1162\/tacl_a_00115"},{"key":"14","doi-asserted-by":"crossref","unstructured":"Liu, P., Yuan, W., Fu, J., Jiang, Z., Hayashi, H., and Neubig, G. (2023). 
\u201cPre-train, Prompt, and Predict: A Systematic Survey of Prompting Methods in Natural Language Processing.\u201d <i>ACM Computing Surveys<\/i>, 55 (9), pp. 1\u201335.","DOI":"10.1145\/3560815"},{"key":"15","unstructured":"Liu, Y., Ott, M., Goyal, N., Du, J., Joshi, M., Chen, D., Levy, O., Lewis, M., Zettlemoyer, L., and Stoyanov, V. (2019). \u201cRoBERTa: A Robustly Optimized BERT Pretraining Approach.\u201d <i>arXiv preprint arXiv:1907.11692<\/i>."},{"key":"16","doi-asserted-by":"crossref","unstructured":"Marvin, R. and Linzen, T. (2018). \u201cTargeted Syntactic Evaluation of Language Models.\u201d In Riloff, E., Chiang, D., Hockenmaier, J., and Tsujii, J. (Eds.), <i>Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing<\/i>, pp. 1192\u20131202, Brussels, Belgium. Association for Computational Linguistics.","DOI":"10.18653\/v1\/D18-1151"},{"key":"17","unstructured":"Meta (2024). \u201cIntroducing Meta Llama 3: The Most Capable Openly Available LLM to Date.\u201d https:\/\/ai.meta.com\/blog\/meta-llama-3\/."},{"key":"18","doi-asserted-by":"crossref","unstructured":"Mueller, A., Nicolai, G., Petrou-Zeniou, P., Talmina, N., and Linzen, T. (2020). \u201cCross-Linguistic Syntactic Evaluation of Word Prediction Models.\u201d In Jurafsky, D., Chai, J., Schluter, N., and Tetreault, J. (Eds.), <i>Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics<\/i>, pp. 5523\u20135539, Online. Association for Computational Linguistics.","DOI":"10.18653\/v1\/2020.acl-main.490"},{"key":"19","unstructured":"Pauls, A. and Klein, D. (2012). \u201cLarge-Scale Syntactic Language Modeling with Treelets.\u201d In Li, H., Lin, C.-Y., Osborne, M., Lee, G. G., and Park, J. C. (Eds.), <i>Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)<\/i>, pp. 959\u2013968, Jeju Island, Korea. 
Association for Computational Linguistics."},{"key":"20","doi-asserted-by":"crossref","unstructured":"Peng, K., Ding, L., Zhong, Q., Shen, L., Liu, X., Zhang, M., Ouyang, Y., and Tao, D. (2023). \u201cTowards Making the Most of ChatGPT for Machine Translation.\u201d In Bouamor, H., Pino, J., and Bali, K. (Eds.), <i>Findings of the Association for Computational Linguistics: EMNLP 2023<\/i>, pp. 5622\u20135633, Singapore. Association for Computational Linguistics.","DOI":"10.18653\/v1\/2023.findings-emnlp.373"},{"key":"21","doi-asserted-by":"crossref","unstructured":"Pezeshkpour, P. and Hruschka, E. (2023). \u201cLarge Language Models Sensitivity to The Order of Options in Multiple-Choice Questions.\u201d <i>arXiv preprint arXiv:2308.11483<\/i>.","DOI":"10.18653\/v1\/2024.findings-naacl.130"},{"key":"22","doi-asserted-by":"crossref","unstructured":"Pham, T., Bui, T., Mai, L., and Nguyen, A. (2021). \u201cOut of Order: How Important Is the Sequential Order of Words in a Sentence in Natural Language Understanding Tasks?\u201d In Zong, C., Xia, F., Li, W., and Navigli, R. (Eds.), <i>Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021<\/i>, pp. 1145\u20131160, Online. Association for Computational Linguistics.","DOI":"10.18653\/v1\/2021.findings-acl.98"},{"key":"23","unstructured":"Qwen Team (2024). \u201cIntroducing Qwen1.5.\u201d https:\/\/qwenlm.github.io\/blog\/qwen1.5\/."},{"key":"24","doi-asserted-by":"crossref","unstructured":"Rafailov, R., Sharma, A., Mitchell, E., Manning, C. D., Ermon, S., and Finn, C. (2023). \u201cDirect Preference Optimization: Your Language Model is Secretly a Reward Model.\u201d In Oh, A., Naumann, T., Globerson, A., Saenko, K., Hardt, M., and Levine, S. (Eds.), <i>Advances in Neural Information Processing Systems<\/i>, Vol. 36, pp. 53728\u201353741. 
Curran Associates, Inc.","DOI":"10.52202\/075280-2338"},{"key":"25","doi-asserted-by":"crossref","unstructured":"Sakai, Y., Nohejl, A., Hang, J., Kamigaito, H., and Watanabe, T. (2024). \u201cToward the Evaluation of Large Language Models Considering Score Variance across Instruction Templates.\u201d In Belinkov, Y., Kim, N., Jumelet, J., Mohebbi, H., Mueller, A., and Chen, H. (Eds.), <i>Proceedings of the 7th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP<\/i>, pp. 499\u2013529, Miami, Florida, US. Association for Computational Linguistics.","DOI":"10.18653\/v1\/2024.blackboxnlp-1.31"},{"key":"26","unstructured":"Sch\u00fctze, C. T. (2016). <i>The Empirical Base of Linguistics<\/i>. No. 2 in Classics in Linguistics. Language Science Press, Berlin."},{"key":"27","doi-asserted-by":"crossref","unstructured":"Sinha, K., Jia, R., Hupkes, D., Pineau, J., Williams, A., and Kiela, D. (2021a). \u201cMasked Language Modeling and the Distributional Hypothesis: Order Word Matters Pre-training for Little.\u201d In Moens, M.-F., Huang, X., Specia, L., and Yih, S. W.-t. (Eds.), <i>Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing<\/i>, pp. 2888\u20132913, Online and Punta Cana, Dominican Republic. Association for Computational Linguistics.","DOI":"10.18653\/v1\/2021.emnlp-main.230"},{"key":"28","doi-asserted-by":"crossref","unstructured":"Sinha, K., Parthasarathi, P., Pineau, J., and Williams, A. (2021b). \u201cUnNatural Language Inference.\u201d In Zong, C., Xia, F., Li, W., and Navigli, R. (Eds.), <i>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)<\/i>, pp. 7329\u20137346, Online. Association for Computational Linguistics.","DOI":"10.18653\/v1\/2021.acl-long.569"},{"key":"29","doi-asserted-by":"crossref","unstructured":"Someya, T. and Oseki, Y. (2023). 
\u201cJBLiMP: Japanese Benchmark of Linguistic Minimal Pairs.\u201d In Vlachos, A. and Augenstein, I. (Eds.), <i>Findings of the Association for Computational Linguistics: EACL 2023<\/i>, pp. 1581\u20131594, Dubrovnik, Croatia. Association for Computational Linguistics.","DOI":"10.18653\/v1\/2023.findings-eacl.117"},{"key":"30","doi-asserted-by":"crossref","unstructured":"Tjuatja, L., Neubig, G., Linzen, T., and Hao, S. (2025). \u201cWhat Goes into a LM Acceptability Judgment? Rethinking the Impact of Frequency and Length.\u201d In Chiruzzo, L., Ritter, A., and Wang, L. (Eds.), <i>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)<\/i>, pp. 2173\u20132186, Albuquerque, New Mexico. Association for Computational Linguistics.","DOI":"10.18653\/v1\/2025.naacl-long.109"},{"key":"31","doi-asserted-by":"crossref","unstructured":"Ueda, N., Mita, M., Oka, T., and Komachi, M. (2024). \u201cToken-length Bias in Minimal-pair Paradigm Datasets.\u201d In Calzolari, N., Kan, M.-Y., Hoste, V., Lenci, A., Sakti, S., and Xue, N. (Eds.), <i>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)<\/i>, pp. 16224\u201316236, Torino, Italia. ELRA and ICCL.","DOI":"10.63317\/38xk2sffxs5p"},{"key":"32","doi-asserted-by":"crossref","unstructured":"Wang, A., Singh, A., Michael, J., Hill, F., Levy, O., and Bowman, S. (2018). \u201cGLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding.\u201d In Linzen, T., Chrupa\u0142a, G., and Alishahi, A. (Eds.), <i>Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP<\/i>, pp. 353\u2013355, Brussels, Belgium. 
Association for Computational Linguistics.","DOI":"10.18653\/v1\/W18-5446"},{"key":"33","doi-asserted-by":"crossref","unstructured":"Warstadt, A., Parrish, A., Liu, H., Mohananey, A., Peng, W., Wang, S.-F., and Bowman, S. R. (2020). \u201cBLiMP: The Benchmark of Linguistic Minimal Pairs for English.\u201d <i>Transactions of the Association for Computational Linguistics<\/i>, 8, pp. 377\u2013392.","DOI":"10.1162\/tacl_a_00321"},{"key":"34","doi-asserted-by":"crossref","unstructured":"Warstadt, A., Singh, A., and Bowman, S. R. (2019). \u201cNeural Network Acceptability Judgments.\u201d <i>Transactions of the Association for Computational Linguistics<\/i>, 7, pp. 625\u2013641.","DOI":"10.1162\/tacl_a_00290"},{"key":"35","unstructured":"Wei, J., Bosma, M., Zhao, V., Guu, K., Yu, A. W., Lester, B., Du, N., Dai, A. M., and Le, Q. V. (2022). \u201cFinetuned Language Models are Zero-Shot Learners.\u201d In <i>International Conference on Learning Representations<\/i>."},{"key":"36","unstructured":"Willard, B. T. and Louf, R. (2023). \u201cEfficient Guided Generation for Large Language Models.\u201d <i>arXiv preprint arXiv:2307.09702<\/i>."},{"key":"37","unstructured":"Wu, Y., Schuster, M., Chen, Z., Le, Q. V., Norouzi, M., Macherey, W., Krikun, M., Cao, Y., Gao, Q., Macherey, K., Klingner, J., Shah, A., Johnson, M., Liu, X., Kaiser, \u0141., Gouws, S., Kato, Y., Kudo, T., Kazawa, H., Stevens, K., Kurian, G., Patil, N., Wang, W., Young, C., Smith, J., Riesa, J., Rudnick, A., Vinyals, O., Corrado, G., Hughes, M., and Dean, J. (2016). \u201cGoogle\u2019s Neural Machine Translation System: Bridging the Gap between Human and Machine Translation.\u201d <i>arXiv preprint arXiv:1609.08144<\/i>."},{"key":"38","doi-asserted-by":"crossref","unstructured":"Xiang, B., Yang, C., Li, Y., Warstadt, A., and Kann, K. (2021). \u201cCLiMP: A Benchmark for Chinese Language Model Evaluation.\u201d In Merlo, P., Tiedemann, J., and Tsarfaty, R. 
(Eds.), <i>Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume<\/i>, pp. 2784\u20132790, Online. Association for Computational Linguistics.","DOI":"10.18653\/v1\/2021.eacl-main.242"},{"key":"39","unstructured":"Young, A., Chen, B., Li, C., Huang, C., Zhang, G., Zhang, G., Li, H., Zhu, J., Chen, J., Chang, J., Yu, K., Liu, P., Liu, Q., Yue, S., Yang, S., Yang, S., Yu, T., Xie, W., Huang, W., Hu, X., Ren, X., Niu, X., Nie, P., Xu, Y., Liu, Y., Wang, Y., Cai, Y., Gu, Z., Liu, Z., and Dai, Z. (2024). \u201cYi: Open Foundation Models by 01.AI.\u201d <i>arXiv preprint arXiv:2403.04652<\/i>."},{"key":"40","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Liu, Y., Huang, W., Mao, J., Wang, R., and Hu, H. (2024). \u201cMELA: Multilingual Evaluation of Linguistic Acceptability.\u201d In Ku, L.-W., Martins, A., and Srikumar, V. (Eds.), <i>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)<\/i>, pp. 2658\u20132674, Bangkok, Thailand. Association for Computational Linguistics.","DOI":"10.18653\/v1\/2024.acl-long.146"},{"key":"41","unstructured":"Zheng, C., Zhou, H., Meng, F., Zhou, J., and Huang, M. (2024). \u201cLarge Language Models Are Not Robust Multiple Choice Selectors.\u201d In <i>The 12th International Conference on Learning Representations<\/i>."},{"key":"42","doi-asserted-by":"crossref","unstructured":"Zheng, L., Chiang, W.-L., Sheng, Y., Zhuang, S., Wu, Z., Zhuang, Y., Lin, Z., Li, Z., Li, D., Xing, E. P., Zhang, H., Gonzalez, J. E., and Stoica, I. (2023). \u201cJudging LLM-as-a-Judge with MT-Bench and Chatbot Arena.\u201d <i>arXiv preprint arXiv:2306.05685<\/i>.","DOI":"10.52202\/075280-2020"},{"key":"43","doi-asserted-by":"crossref","unstructured":"Zhong, M., Liu, Y., Yin, D., Mao, Y., Jiao, Y., Liu, P., Zhu, C., Ji, H., and Han, J. (2022). 
\u201cTowards a Unified Multi-Dimensional Evaluator for Text Generation.\u201d In Goldberg, Y., Kozareva, Z., and Zhang, Y. (Eds.), <i>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing<\/i>, pp. 2023\u20132038, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.","DOI":"10.18653\/v1\/2022.emnlp-main.131"}],"container-title":["Journal of Natural Language Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/jnlp\/33\/1\/33_51\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,21]],"date-time":"2026-03-21T03:53:54Z","timestamp":1774065234000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/jnlp\/33\/1\/33_51\/_article\/-char\/ja\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":43,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026]]}},"URL":"https:\/\/doi.org\/10.5715\/jnlp.33.51","relation":{},"ISSN":["1340-7619","2185-8314"],"issn-type":[{"value":"1340-7619","type":"print"},{"value":"2185-8314","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]}}}