{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T14:49:26Z","timestamp":1781016566070,"version":"3.54.1"},"reference-count":48,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,12,15]],"date-time":"2024-12-15T00:00:00Z","timestamp":1734220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,15]],"date-time":"2024-12-15T00:00:00Z","timestamp":1734220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,12,15]]},"DOI":"10.1109\/bigdata62323.2024.10825051","type":"proceedings-article","created":{"date-parts":[[2025,1,16]],"date-time":"2025-01-16T18:31:23Z","timestamp":1737052283000},"page":"3313-3321","source":"Crossref","is-referenced-by-count":7,"title":["Dynamic Intelligence Assessment: Benchmarking LLMs on the Road to AGI with a Focus on Model Confidence"],"prefix":"10.1109","author":[{"given":"Norbert","family":"Tihanyi","sequence":"first","affiliation":[{"name":"Technology Innovation Institute,Abu Dhabi,United Arab Emirates"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tamas","family":"Bisztray","sequence":"additional","affiliation":[{"name":"University of Oslo,Oslo,Norway"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Richard A.","family":"Dubniczky","sequence":"additional","affiliation":[{"name":"E\u00f6tv\u00f6s Lor\u00e1nd University,Budapest,Hungary"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rebeka","family":"Toth","sequence":"additional","affiliation":[{"name":"University of 
Oslo,Oslo,Norway"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Bertalan","family":"Borsos","sequence":"additional","affiliation":[{"name":"E\u00f6tv\u00f6s Lor\u00e1nd University,Budapest,Hungary"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Bilel","family":"Cherif","sequence":"additional","affiliation":[{"name":"Technology Innovation Institute,Abu Dhabi,United Arab Emirates"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ridhi","family":"Jain","sequence":"additional","affiliation":[{"name":"Technology Innovation Institute,Abu Dhabi,United Arab Emirates"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lajos","family":"Muzsai","sequence":"additional","affiliation":[{"name":"E\u00f6tv\u00f6s Lor\u00e1nd University,Budapest,Hungary"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mohamed Amine","family":"Ferrag","sequence":"additional","affiliation":[{"name":"University of Guelma,Guelma,Algeria"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ryan","family":"Marinelli","sequence":"additional","affiliation":[{"name":"University of Oslo,Oslo,Norway"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lucas C.","family":"Cordeiro","sequence":"additional","affiliation":[{"name":"The University of Manchester,Manchester,United Kingdom"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Merouane","family":"Debbah","sequence":"additional","affiliation":[{"name":"Khalifa University,Abu Dhabi,United Arab Emirates"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Vasileios","family":"Mavroeidis","sequence":"additional","affiliation":[{"name":"University of Oslo,Oslo,Norway"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Audun","family":"J\u00f8sang","sequence":"additional","affiliation":[{"name":"University of 
Oslo,Oslo,Norway"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/6928.003.0012"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/S0004-3702(01)00129-1"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.2307\/j.ctv22pzxz1"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.79.8.2554.PMC346238.PMID6953413"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1038\/323533a0"},{"key":"ref6","article-title":"Attention is all you need","volume-title":"Advances in Neural Information Processing Systems","volume":"30","author":"Vaswani","year":"2017"},{"key":"ref7","article-title":"Bert: Pretraining of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018"},{"key":"ref8","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Brown","year":"2020"},{"key":"ref9","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/W18-5446","article-title":"GLUE: A multi-task benchmark and analysis platform for natural language understanding","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Wang"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d16-1264"},{"key":"ref11","article-title":"Evaluating large language models trained on code","author":"Chen","year":"2021"},{"key":"ref12","article-title":"Benchmark self-evolving: A multi-agent framework for dynamic LLM evaluation","author":"Wang","year":"2024"},{"key":"ref13","article-title":"Turbulence: Systematically and automatically testing instruction-tuned large language models for code","author":"Honarvar","year":"2023"},{"key":"ref14","article-title":"GSM-Symbolic: Understanding the limitations of mathematical reasoning in large language 
models","author":"Mirzadeh","year":"2024"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref16","first-page":"74","article-title":"Rouge: A package for automatic evaluation of summaries","author":"Lin","year":"2004","journal-title":"Text summarization branches out"},{"key":"ref17","article-title":"Bertscore: Evaluating text generation with BERT","author":"Zhang","year":"2019"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/sp.2017.49"},{"key":"ref19","article-title":"Towards out-of-distribution generalization: A survey","author":"Liu","year":"2021"},{"key":"ref20","article-title":"Cyberseceval 2: A wide-ranging cybersecurity evaluation suite for large language models","author":"Bhatt","year":"2024"},{"key":"ref21","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2023.acl-long.773","article-title":"ReCode: Robustness evaluation of code generation models","volume-title":"ACL 2023","author":"Wang"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/csr61664.2024.10679494"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/3597503.3623316"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE43902.2021.00107"},{"key":"ref25","article-title":"Grok-2 beta release","year":"2024"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00266"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1259"},{"key":"ref28","article-title":"MINT: Evaluating LLMs in multi-turn interaction with tools and language feedback","author":"Wang","year":"2023"},{"key":"ref29","article-title":"Aligning AI with shared human values","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Hendrycks"},{"key":"ref30","article-title":"Measuring massive multitask language understanding","volume-title":"Proceedings of the International Conference on Learning Representations 
(ICLR)","author":"Hendrycks"},{"key":"ref31","article-title":"MMLU-Pro: A more robust and challenging multi-task language understanding benchmark","author":"Wang","year":"2024"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"ref34","article-title":"Program synthesis with large language models","author":"Austin","year":"2021"},{"key":"ref35","article-title":"Measuring mathematical problem solving with the MATH dataset","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Hendrycks"},{"key":"ref36","article-title":"MathVista: Evaluating mathematical reasoning of foundation models in visual contexts","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Lu"},{"key":"ref37","article-title":"Measuring coding challenge competence with APPS","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Hendrycks"},{"key":"ref38","article-title":"Testing robustness against unforeseen adversaries","author":"Kaufmann","year":"2019"},{"key":"ref39","first-page":"284","article-title":"Synthesizing robust adversarial examples","volume-title":"Proceedings of the International Conference on Machine Learning","author":"Athalye"},{"key":"ref40","article-title":"Piloting copilot and codex: Hot temperature, cold prompts, or black magic?","author":"D\u00f6derlein","year":"2022"},{"key":"ref41","article-title":"SecQA: A Concise Question-Answering Dataset for Evaluating Large Language Models in Computer Security","author":"Liu","year":"2023"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/SP46214.2022.9833571"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3617555.3617874"},{"key":"ref44","article-title":"Training verifiers to solve math word problems","author":"Cobbe","year":"2021"},{"key":"ref45","article-title":"M4LE: A 
multi-ability multi-range multi-task multi-domain long-context evaluation benchmark for large language models","author":"Kwan","year":"2023"},{"key":"ref46","article-title":"Bamboo: A comprehensive benchmark for evaluating long text modeling capacities of large language models","author":"Dong","year":"2023"},{"key":"ref47","article-title":"LongBench: A bilingual, multitask benchmark for long context understanding","author":"Bai","year":"2023"},{"key":"ref48","article-title":"Milebench: Benchmarking MLLMs in long context","author":"Song","year":"2024"}],"event":{"name":"2024 IEEE International Conference on Big Data (BigData)","location":"Washington, DC, USA","start":{"date-parts":[[2024,12,15]]},"end":{"date-parts":[[2024,12,18]]}},"container-title":["2024 IEEE International Conference on Big Data (BigData)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10824975\/10824942\/10825051.pdf?arnumber=10825051","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,17]],"date-time":"2025-01-17T07:45:09Z","timestamp":1737099909000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10825051\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,15]]},"references-count":48,"URL":"https:\/\/doi.org\/10.1109\/bigdata62323.2024.10825051","relation":{},"subject":[],"published":{"date-parts":[[2024,12,15]]}}}