{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,16]],"date-time":"2026-03-16T18:30:55Z","timestamp":1773685855311,"version":"3.50.1"},"reference-count":50,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2025,6,2]],"date-time":"2025-06-02T00:00:00Z","timestamp":1748822400000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"funder":[{"DOI":"10.13039\/100000002","name":"National Institutes of Health","doi-asserted-by":"publisher","award":["R01GM123558"],"award-info":[{"award-number":["R01GM123558"]}],"id":[{"id":"10.13039\/100000002","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Computers in Human Behavior: Artificial Humans"],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1016\/j.chbah.2025.100170","type":"journal-article","created":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T14:09:35Z","timestamp":1750255775000},"page":"100170","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":1,"special_numbering":"C","title":["Evaluating the Intelligence of large language models: A comparative study using verbal and visual IQ tests"],"prefix":"10.1016","volume":"5","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7456-6133","authenticated-orcid":false,"given":"Sherif","family":"Abdelkarim","sequence":"first","affiliation":[]},{"given":"David","family":"Lu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2623-3772","authenticated-orcid":false,"given":"Dora-Luz","family":"Flores","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6165-2526","authenticated-orcid":false,"given":"Susanne","family":"Jaeggi","sequence":"additional","affiliation":[]},{"given":"Pierre","family":"Baldi","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.chbah.2025.100170_b1","series-title":"Clinical knowledge and reasoning abilities of AI large language models in anesthesiology: A comparative study on the ABA exam","author":"Angel","year":"2023"},{"key":"10.1016\/j.chbah.2025.100170_b2","article-title":"Performance of large language models on pharmacy exam: A comparative assessment using the NAPLEX","author":"Angel","year":"2023","journal-title":"bioRxiv"},{"key":"10.1016\/j.chbah.2025.100170_b3","series-title":"Claude 3","author":"Anthropic","year":"2024"},{"key":"10.1016\/j.chbah.2025.100170_b4","series-title":"Neural-symbolic learning and reasoning: A survey and interpretation","author":"Besold","year":"2017"},{"key":"10.1016\/j.chbah.2025.100170_b5","first-page":"1877","article-title":"Language models are few-shot learners","volume":"vol. 33","author":"Brown","year":"2020"},{"key":"10.1016\/j.chbah.2025.100170_b6","doi-asserted-by":"crossref","first-page":"404","DOI":"10.1037\/0033-295X.97.3.404","article-title":"What one intelligence test measures: A theoretical account of the processing in the raven progressive matrices test","volume":"97","author":"Carpenter","year":"1990","journal-title":"Psychological Review"},{"key":"10.1016\/j.chbah.2025.100170_b7","series-title":"Measuring intelligence with the culture fair tests","author":"Cattell","year":"1960"},{"key":"10.1016\/j.chbah.2025.100170_b8","series-title":"The twelfth international conference on learning representations","article-title":"Teaching large language models to self-debug","author":"Chen","year":"2024"},{"key":"10.1016\/j.chbah.2025.100170_b9","series-title":"Theoretical limitations of multi-layer transformer","author":"Chen","year":"2024"},{"key":"10.1016\/j.chbah.2025.100170_b10","series-title":"Evaluating large language models trained on code","author":"Chen","year":"2021"},{"key":"10.1016\/j.chbah.2025.100170_b11","series-title":"Think you have solved question answering? Try ARC, the AI2 reasoning challenge","author":"Clark","year":"2018"},{"key":"10.1016\/j.chbah.2025.100170_b12","series-title":"Training verifiers to solve math word problems","author":"Cobbe","year":"2021"},{"key":"10.1016\/j.chbah.2025.100170_b13","series-title":"Evaluating language models for mathematics through interactions","author":"Collins","year":"2023"},{"key":"10.1016\/j.chbah.2025.100170_b14","unstructured":"Dua, D., Wang, Y., Dasigi, P., Stanovsky, G., Singh, S., & Gardner, M. (2019). DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs. In Proceedings of NAACL-HLT (pp. 2368\u20132378)."},{"key":"10.1016\/j.chbah.2025.100170_b15","series-title":"Proceedings of the 37th international conference on neural information processing systems","article-title":"Faith and fate: limits of transformers on compositionality","author":"Dziri","year":"2023"},{"key":"10.1016\/j.chbah.2025.100170_b16","series-title":"Thirty-seventh conference on neural information processing systems datasets and benchmarks track","article-title":"Mathematical capabilities of chatGPT","author":"Frieder","year":"2023"},{"key":"10.1016\/j.chbah.2025.100170_b17","doi-asserted-by":"crossref","DOI":"10.3389\/fpsyg.2019.02857","article-title":"The looking glass for intelligence quotient tests: The interplay of motivation, cognitive functioning, and affect","volume":"10","author":"Ganuthula","year":"2019","journal-title":"Frontiers in Psychology"},{"key":"10.1016\/j.chbah.2025.100170_b18","series-title":"PAL: Program-aided language models","author":"Gao","year":"2023"},{"key":"10.1016\/j.chbah.2025.100170_b19","series-title":"Gemini: A family of highly capable multimodal models","author":"Gemini Team Google Rohan Anil","year":"2023"},{"key":"10.1016\/j.chbah.2025.100170_b20","series-title":"International conference on learning representations","article-title":"Measuring massive multitask language understanding","author":"Hendrycks","year":"2021"},{"key":"10.1016\/j.chbah.2025.100170_b21","series-title":"Measuring mathematical problem solving with the MATH dataset","author":"Hendrycks","year":"2021"},{"key":"10.1016\/j.chbah.2025.100170_b22","series-title":"Mistral 7B","author":"Jiang","year":"2023"},{"key":"10.1016\/j.chbah.2025.100170_b23","series-title":"Scaling laws for neural language models","author":"Kaplan","year":"2020"},{"key":"10.1016\/j.chbah.2025.100170_b24","series-title":"Capabilities of large language models in control engineering: A benchmark study on GPT-4, Claude 3 Opus, and Gemini 1.0 Ultra","author":"Kevian","year":"2024"},{"key":"10.1016\/j.chbah.2025.100170_b25","series-title":"Capabilities of large language models in control engineering: A benchmark study on GPT-4, Claude 3 Opus, and Gemini 1.0 Ultra","author":"Kevian","year":"2024"},{"key":"10.1016\/j.chbah.2025.100170_b26","series-title":"Administration of the text-based portions of a general IQ test to five different large language models","author":"King","year":"2023"},{"key":"10.1016\/j.chbah.2025.100170_b27","first-page":"3843","article-title":"Solving quantitative reasoning problems with language models","volume":"vol. 35","author":"Lewkowycz","year":"2022"},{"key":"10.1016\/j.chbah.2025.100170_b28","unstructured":"Lewkowycz, A., Slone, A., Andreassen, A., Freeman, D., Dyer, E. S., Mishra, G., et al. (2022). Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models: Tech. rep.."},{"key":"10.1016\/j.chbah.2025.100170_b29","series-title":"Holistic evaluation of language models","author":"Liang","year":"2023"},{"key":"10.1016\/j.chbah.2025.100170_b30","series-title":"TruthfulQA: Measuring how models mimic human falsehoods","author":"Lin","year":"2021"},{"key":"10.1016\/j.chbah.2025.100170_b31","doi-asserted-by":"crossref","unstructured":"Liu, L., Liu, Y., Yu, B., Zhang, C., Huang, Q., & Fu, J. (2020). LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning. In Proceedings of the twenty-ninth international joint conference on artificial intelligence (pp. 3622\u20133628).","DOI":"10.24963\/ijcai.2020\/501"},{"issue":"1","key":"10.1016\/j.chbah.2025.100170_b32","doi-asserted-by":"crossref","first-page":"e7","DOI":"10.1213\/ANE.0000000000007322","article-title":"Large Language Models and the American Board of Anesthesiology Examination","volume":"140","author":"Macario","year":"2025","journal-title":"Anesthesia & Analgesia"},{"key":"10.1016\/j.chbah.2025.100170_b33","series-title":"Self-refine: Iterative refinement with self-feedback","author":"Madaan","year":"2023"},{"key":"10.1016\/j.chbah.2025.100170_b34","series-title":"Mistral models","author":"Mistral AI","year":"2024"},{"key":"10.1016\/j.chbah.2025.100170_b35","series-title":"GPT-4 technical report","author":"OpenAI","year":"2023"},{"key":"10.1016\/j.chbah.2025.100170_b36","series-title":"GPT3.5 turbo","author":"OpenAI","year":"2023"},{"key":"10.1016\/j.chbah.2025.100170_b37","article-title":"Improving language understanding by generative pre-training","author":"Radford","year":"2018","journal-title":"OpenAI Blog"},{"issue":"8","key":"10.1016\/j.chbah.2025.100170_b38","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI Blog"},{"key":"10.1016\/j.chbah.2025.100170_b39","doi-asserted-by":"crossref","unstructured":"Sakaguchi, K., Zhong, R., Chen, T., & Peng, Y. (2020). WinoGrande: An Adversarial Winograd Schema Challenge at Scale. In Proceedings of the AAAI conference on artificial intelligence (pp. 8734\u20138741).","DOI":"10.1609\/aaai.v34i05.6399"},{"key":"10.1016\/j.chbah.2025.100170_b40","series-title":"Self-scoring tests","article-title":"Self-scoring IQ tests","author":"Serebriakoff","year":"1996"},{"key":"10.1016\/j.chbah.2025.100170_b41","first-page":"8634","article-title":"Reflexion: language agents with verbal reinforcement learning","volume":"vol. 36","author":"Shinn","year":"2023"},{"key":"10.1016\/j.chbah.2025.100170_b42","series-title":"Annual meeting of the association for computational linguistics","article-title":"Challenging BIG-bench tasks and whether chain-of-thought can solve them","author":"Suzgun","year":"2022"},{"key":"10.1016\/j.chbah.2025.100170_b43","series-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"10.1016\/j.chbah.2025.100170_b44","series-title":"Self-consistency improves chain of thought reasoning in language models","author":"Wang","year":"2023"},{"key":"10.1016\/j.chbah.2025.100170_b45","series-title":"Wechsler adult intelligence scale: WAIS-IV; technical and interpretive manual","author":"Wechsler","year":"2008"},{"key":"10.1016\/j.chbah.2025.100170_b46","series-title":"Proceedings of the 36th international conference on neural information processing systems","article-title":"Chain-of-thought prompting elicits reasoning in large language models","author":"Wei","year":"2022"},{"key":"10.1016\/j.chbah.2025.100170_b47","series-title":"A comparative study of open-source large language models, GPT-4 and claude 2: Multiple-choice test taking in nephrology","author":"Wu","year":"2023"},{"key":"10.1016\/j.chbah.2025.100170_b48","series-title":"HellaSwag: Can a machine really finish your sentence?","author":"Zellers","year":"2019"},{"key":"10.1016\/j.chbah.2025.100170_b49","series-title":"How far are we from intelligent visual deductive reasoning?","author":"Zhang","year":"2024"},{"key":"10.1016\/j.chbah.2025.100170_b50","doi-asserted-by":"crossref","unstructured":"Zhang, C., Gao, F., Wang, C., Xu, Y., & Zhu, S.-C. (2019). RAVEN: A Dataset for Relational and Analogical Visual REasoNing. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 5317\u20135327).","DOI":"10.1109\/CVPR.2019.00546"}],"container-title":["Computers in Human Behavior: Artificial Humans"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S2949882125000544?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S2949882125000544?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T05:31:50Z","timestamp":1763703110000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S2949882125000544"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8]]},"references-count":50,"alternative-id":["S2949882125000544"],"URL":"https:\/\/doi.org\/10.1016\/j.chbah.2025.100170","relation":{},"ISSN":["2949-8821"],"issn-type":[{"value":"2949-8821","type":"print"}],"subject":[],"published":{"date-parts":[[2025,8]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Evaluating the Intelligence of large language models: A comparative study using verbal and visual IQ tests","name":"articletitle","label":"Article Title"},{"value":"Computers in Human Behavior: Artificial Humans","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.chbah.2025.100170","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 The Authors. Published by Elsevier Inc.","name":"copyright","label":"Copyright"}],"article-number":"100170"}}