{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,27]],"date-time":"2026-02-27T04:31:06Z","timestamp":1772166666740,"version":"3.50.1"},"reference-count":31,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,7,26]],"date-time":"2025-07-26T00:00:00Z","timestamp":1753488000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,7,26]],"date-time":"2025-07-26T00:00:00Z","timestamp":1753488000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Big Data"],"DOI":"10.1186\/s40537-025-01209-3","type":"journal-article","created":{"date-parts":[[2025,7,26]],"date-time":"2025-07-26T07:44:59Z","timestamp":1753515899000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["An LLM-guided platform for multi-granular collection and management of data provenance"],"prefix":"10.1186","volume":"12","author":[{"given":"Luca","family":"Gregori","sequence":"first","affiliation":[]},{"given":"Pasquale Leonardo","family":"Lazzaro","sequence":"additional","affiliation":[]},{"given":"Marialaura","family":"Lazzaro","sequence":"additional","affiliation":[]},{"given":"Paolo","family":"Missier","sequence":"additional","affiliation":[]},{"given":"Riccardo","family":"Torlone","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,7,26]]},"reference":[{"key":"1209_CR1","doi-asserted-by":"publisher","first-page":"52138","DOI":"10.1109\/ACCESS.2018.2870052","volume":"6","author":"A Adadi","year":"2018","unstructured":"Adadi A, Berrada M. Peeking inside the black-box: a survey on explainable artificial intelligence (xai). IEEE Access. 2018;6:52138\u201360. https:\/\/doi.org\/10.1109\/ACCESS.2018.2870052.","journal-title":"IEEE Access"},{"key":"1209_CR2","unstructured":"Sundararajan M, Najmi A. The many shapley values for model explanation. In: Proceedings of the 37th International Conference on Machine Learning. Proceedings of Machine Learning Research, 2020;119:9269\u20139278 . https:\/\/proceedings.mlr.press\/v119\/sundararajan20b.html"},{"key":"1209_CR3","doi-asserted-by":"publisher","unstructured":"Jacovi A, Marasovi\u0107 A, Miller T, Goldberg Y. Formalizing trust in artificial intelligence: Prerequisites, causes and goals of human trust in ai. In: Proceedings of the 2021 ACM Conference on Fairness, Accountability, and Transparency. FAccT \u201921, pp. 624\u2013635. Association for Computing Machinery, New York, NY, USA 2021. https:\/\/doi.org\/10.1145\/3442188.3445923","DOI":"10.1145\/3442188.3445923"},{"key":"1209_CR4","doi-asserted-by":"publisher","DOI":"10.3390\/data6020011","author":"E Alshdaifat","year":"2021","unstructured":"Alshdaifat E, Alshdaifat D, Alsarhan A, Hussein F, El-Salhi SMFS. The effect of preprocessing techniques, applied to numeric features, on classification algorithms\u2019 performance. Data. 2021. https:\/\/doi.org\/10.3390\/data6020011.","journal-title":"Data"},{"key":"1209_CR5","unstructured":"Neutatz F, Chen B, Abedjan Z, Wu E, Berlin T. From Cleaning before ML to Cleaning for ML."},{"issue":"3\u20134","key":"1209_CR6","doi-asserted-by":"publisher","first-page":"209","DOI":"10.1561\/1900000068","volume":"9","author":"B Glavic","year":"2021","unstructured":"Glavic B. Data provenance. Found Trends Databases. 2021;9(3\u20134):209\u2013441.","journal-title":"Found Trends Databases"},{"key":"1209_CR7","unstructured":"Moreau L, Missier P, Belhajjame K, B\u2019Far R, Cheney J, Coppens S, et al. Prov-dm: The prov data model. w3c 2013."},{"key":"1209_CR8","doi-asserted-by":"crossref","unstructured":"Gregori L, Missier P, Stidolph M, Torlone r, Wood A. Design and Development of a Provenance Capture Platform for Data Science. In: Procs. 3rd DATAPLAT Workshop, Co-located with ICDE 2024. IEEE, Utrecht, NL 2024.","DOI":"10.1109\/ICDEW61823.2024.00042"},{"issue":"12","key":"1209_CR9","doi-asserted-by":"publisher","first-page":"3614","DOI":"10.14778\/3554821.3554857","volume":"15","author":"A Chapman","year":"2022","unstructured":"Chapman A, Missier P, Lauro L, Torlone R. DPDS: assisting data science with data provenance. PVLDB. 2022;15(12):3614\u20137. https:\/\/doi.org\/10.14778\/3554821.3554857.","journal-title":"PVLDB"},{"key":"1209_CR10","doi-asserted-by":"crossref","unstructured":"Smith MJ, Sala C, Kanter JM, Veeramachaneni K. The machine learning bazaar: Harnessing the ml ecosystem for effective system development. In: Proceedings of the 2020 ACM SIGMOD International Conference on Management of Data. SIGMOD \u201920. ACM, New York, NY, USA 2020","DOI":"10.1145\/3318464.3386146"},{"issue":"1","key":"1209_CR11","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s10115-011-0463-8","volume":"33","author":"F Kamiran","year":"2012","unstructured":"Kamiran F, Calders T. Data preprocessing techniques for classification without discrimination. Knowl Inform Syst. 2012;33(1):1\u201333.","journal-title":"Knowl Inform Syst"},{"key":"1209_CR12","doi-asserted-by":"crossref","unstructured":"Niu X, Kapoor R, Glavic B, Gawlick D, Liu ZH, Radhakrishnan V. Provenance-aware query optimization. In: 33rd IEEE International Conference on Data Engineering, ICDE 2017, San Diego, CA, USA, April 19-22, 2017, 2017;473\u2013484","DOI":"10.1109\/ICDE.2017.104"},{"issue":"1","key":"1209_CR13","first-page":"51","volume":"41","author":"BS Arab","year":"2018","unstructured":"Arab BS, Feng S, Glavic B, Lee S, Niu X, Zeng Q. Gprom-a swiss army knife for your provenance needs. IEEE Data Eng Bull. 2018;41(1):51\u201362.","journal-title":"IEEE Data Eng Bull"},{"key":"1209_CR14","doi-asserted-by":"crossref","unstructured":"Glavic B, Alonso G. Perm: Processing provenance and data on the same data model through query rewriting. In: Ioannidis, Y.E., Lee, D.L., Ng, R.T. (eds.) Proceedings of the 25th International Conference on Data Engineering, ICDE 2009, March 29 2009 - April 2 2009, Shanghai, China, 2009;174\u2013185","DOI":"10.1109\/ICDE.2009.15"},{"key":"1209_CR15","doi-asserted-by":"crossref","unstructured":"Lee S, K\u00f6hler S, Lud\u00e4scher B, Glavic B. A SQL-Middleware Unifying Why and Why-Not Provenance for First-Order Queries. In: 33rd IEEE International Conference on Data Engineering, ICDE 2017, San Diego, CA, USA, April 19-22, 2017, 2017;485\u2013496","DOI":"10.1109\/ICDE.2017.105"},{"issue":"3","key":"1209_CR16","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3311955","volume":"52","author":"JF Pimentel","year":"2019","unstructured":"Pimentel JF, Freire J, Murta L, Braganholo V. A survey on collecting, managing, and analyzing provenance from scripts. ACM Comput Surv (CSUR). 2019;52(3):1\u201338.","journal-title":"ACM Comput Surv (CSUR)"},{"key":"1209_CR17","doi-asserted-by":"publisher","unstructured":"Namaki MH, Floratou A, Psallidas F, Krishnan S, Agrawal A, Wu Y, et al. Vamsa: Automated provenance tracking in data science scripts. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. KDD \u201920, pp. 1542\u20131551. Association for Computing Machinery, New York, NY, USA 2020. https:\/\/doi.org\/10.1145\/3394486.3403205","DOI":"10.1145\/3394486.3403205"},{"key":"1209_CR18","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s00778-021-00726-w","volume":"31","author":"S Grafberger","year":"2022","unstructured":"Grafberger S, Groth P, Stoyanovich J, Schelter S. Data distribution debugging in machine learning pipelines. VLDB J. 2022;31:1\u201324.","journal-title":"VLDB J"},{"key":"1209_CR19","doi-asserted-by":"crossref","unstructured":"Pimentel JF, Freire J, Murta L, Braganholo V. Fine-grained provenance collection over scripts through program slicing. In: International Provenance and Annotation Workshop, 2016;199\u2013203.","DOI":"10.1007\/978-3-319-40593-3_21"},{"issue":"12","key":"1209_CR20","doi-asserted-by":"publisher","first-page":"1841","DOI":"10.14778\/3137765.3137789","volume":"10","author":"JF Pimentel","year":"2017","unstructured":"Pimentel JF, Murta L, Braganholo V, Freire J. noworkflow: a tool for collecting, analyzing, and managing provenance from python scripts. Proc VLDB Endow. 2017;10(12):1841\u20134.","journal-title":"Proc VLDB Endow"},{"key":"1209_CR21","doi-asserted-by":"crossref","unstructured":"McPhillips TM, Song T, Kolisnik T, Aulenbach S, Belhajjame K, Bocinsky K, et al. A user-oriented, language-independent tool for recovering workflow information from scripts. CoRR. 2015. abs\/1502.02403.","DOI":"10.2218\/ijdc.v10i1.370"},{"key":"1209_CR22","doi-asserted-by":"crossref","unstructured":"Zhang Q, Cao Y, Wang Q, Vu D, Thavasimani P, McPhillips T. et al. Revealing the Detailed Lineage of Script Outputs using Hybrid Provenance. In: Procs. 11th Intl. Digital Curation Conference (IDCC) 2017.","DOI":"10.2218\/ijdc.v12i2.585"},{"issue":"12","key":"1209_CR23","doi-asserted-by":"publisher","first-page":"3354","DOI":"10.14778\/3415478.3415556","volume":"13","author":"L Rupprecht","year":"2020","unstructured":"Rupprecht L, Davis JC, Arnold C, Gur Y, Bhagwat D. Improving reproducibility of data science pipelines through transparent provenance capture. Proc VLDB Endow. 2020;13(12):3354\u201368. https:\/\/doi.org\/10.14778\/3415478.3415556.","journal-title":"Proc VLDB Endow"},{"issue":"11","key":"1209_CR24","doi-asserted-by":"publisher","first-page":"2033","DOI":"10.14778\/3407790.3407807","volume":"13","author":"D Petersohn","year":"2020","unstructured":"Petersohn D, Ma WW, Lee DJL, Macke S, Xin D, Mo X, Gonzalez J, Hellerstein JM, Joseph AD, Parameswaran AG. Towards scalable dataframe systems. Proc VLDB Endow. 2020;13(11):2033\u201346.","journal-title":"Proc VLDB Endow"},{"issue":"12","key":"1209_CR25","doi-asserted-by":"publisher","first-page":"3614","DOI":"10.14778\/3554821.3554857","volume":"15","author":"A Chapman","year":"2022","unstructured":"Chapman A, Lauro L, Missier P, Torlone R. Dpds: assisting data science with data provenance. Proc VLDB Endow. 2022;15(12):3614\u20137.","journal-title":"Proc VLDB Endow"},{"key":"1209_CR26","doi-asserted-by":"publisher","unstructured":"FAIRsharing Community: FAIRsharing: C5QG88 (2023). https:\/\/doi.org\/10.24432\/C5QG88.","DOI":"10.24432\/C5QG88."},{"key":"1209_CR27","doi-asserted-by":"publisher","unstructured":"Kohavi R. Census Income 1996. https:\/\/doi.org\/10.24432\/C5GP7S.","DOI":"10.24432\/C5GP7S."},{"key":"1209_CR28","unstructured":"Pfisterer F, Siyi W, Lang M. COMPAS Dataset in mlr3fairness 2023. https:\/\/mlr3fairness.mlr-org.com\/reference\/compas.html."},{"key":"1209_CR29","unstructured":"Volk A. Dataset of USED CARS 2023. https:\/\/www.kaggle.com\/datasets\/volkanastasia\/dataset-of-used-cars."},{"key":"1209_CR30","unstructured":"Dua D, Graff C. UCI Machine Learning Repository: Mushroom Data Set 2019. https:\/\/archive.ics.uci.edu\/dataset\/73\/mushroom."},{"key":"1209_CR31","unstructured":"Kaggle: Titanic - Machine Learning from Disaster 2025. https:\/\/www.kaggle.com\/competitions\/titanic\/data."}],"container-title":["Journal of Big Data"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s40537-025-01209-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1186\/s40537-025-01209-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s40537-025-01209-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,8]],"date-time":"2025-09-08T01:29:49Z","timestamp":1757294989000},"score":1,"resource":{"primary":{"URL":"https:\/\/journalofbigdata.springeropen.com\/articles\/10.1186\/s40537-025-01209-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,26]]},"references-count":31,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,12]]}},"alternative-id":["1209"],"URL":"https:\/\/doi.org\/10.1186\/s40537-025-01209-3","relation":{"has-preprint":[{"id-type":"doi","id":"10.21203\/rs.3.rs-5369549\/v1","asserted-by":"object"}]},"ISSN":["2196-1115"],"issn-type":[{"value":"2196-1115","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,7,26]]},"assertion":[{"value":"1 November 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 May 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 July 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"187"}}