{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T21:37:53Z","timestamp":1740173873785,"version":"3.37.3"},"reference-count":40,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2019,5,18]],"date-time":"2019-05-18T00:00:00Z","timestamp":1558137600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"},{"start":{"date-parts":[[2019,5,18]],"date-time":"2019-05-18T00:00:00Z","timestamp":1558137600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Big Data"],"published-print":{"date-parts":[[2019,12]]},"DOI":"10.1186\/s40537-019-0204-5","type":"journal-article","created":{"date-parts":[[2019,5,18]],"date-time":"2019-05-18T05:59:44Z","timestamp":1558159184000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Diftong: a tool for validating big data workflows"],"prefix":"10.1186","volume":"6","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1150-3503","authenticated-orcid":false,"given":"Raya","family":"Rizk","sequence":"first","affiliation":[]},{"given":"Steve","family":"McKeever","sequence":"additional","affiliation":[]},{"given":"Johan","family":"Petrini","sequence":"additional","affiliation":[]},{"given":"Erik","family":"Zeitler","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,5,18]]},"reference":[{"key":"204_CR1","doi-asserted-by":"publisher","unstructured":"Li N, Escalona A, Guo Y, Offutt J. A scalable big data test framework. In: 2015 IEEE 8th international conference on software testing, verification and validation (ICST); 2015. p. 1\u20132. \n                    https:\/\/doi.org\/10.1109\/ICST.2015.7102619\n                    \n                  .","DOI":"10.1109\/ICST.2015.7102619"},{"key":"204_CR2","doi-asserted-by":"publisher","DOI":"10.5334\/dsj-2015-002","author":"L Cai","year":"2015","unstructured":"Cai L, Zhu Y. The challenges of data quality and data quality assessment in the big bata era. Data Sci J. 2015;. \n                    https:\/\/doi.org\/10.5334\/dsj-2015-002\n                    \n                  .","journal-title":"Data Sci J"},{"key":"204_CR3","unstructured":"Sadiq S, Orlowska M, Sadiq W, Foulger C. Data flow and validation in workflow modelling. In: Proceedings of the 15th Australasian database conference, Vol. 27. ADC \u201904, p. 207\u2013214. Australian Computer Society, Inc., Darlinghurst, Australia, Australia; 2004. \n                    http:\/\/dl.acm.org\/citation.cfm?id=1012294.1012317\n                    \n                  ."},{"key":"204_CR4","doi-asserted-by":"publisher","unstructured":"Taleb I, Dssouli R, Serhani MA. Big data pre-processing: a quality framework. In: 2015 IEEE international congress on big data; 2015. p. 191\u2013198. \n                    https:\/\/doi.org\/10.1109\/BigDataCongress.2015.35\n                    \n                  .","DOI":"10.1109\/BigDataCongress.2015.35"},{"key":"204_CR5","doi-asserted-by":"publisher","unstructured":"Gao J, Xie C, Tao C. Big data validation and quality assurance\u2013issuses, challenges, and needs. In: 2016 IEEE symposium on service-oriented system engineering (SOSE); 2016. p. 433\u2013441. \n                    https:\/\/doi.org\/10.1109\/SOSE.2016.63\n                    \n                  .","DOI":"10.1109\/SOSE.2016.63"},{"key":"204_CR6","unstructured":"Experian data quality: the well-oiled data machine; 2014. \n                    https:\/\/www.edq.com\/uk\/blog\/the-well-oiled-data-machine\n                    \n                  . Accessed 5 May 2018."},{"key":"204_CR7","unstructured":"Moore S. How to create a business case for data quality improvement; 2017. \n                    https:\/\/www.gartner.com\/smarterwithgartner\/how-to-create-a-business-case-for-data-quality-improvement\n                    \n                  . Accessed 12 May 2018."},{"key":"204_CR8","unstructured":"IBM. Extracting business value from the 4 V\u2019s of big data; 2016. \n                    http:\/\/www.ibmbigdatahub.com\/infographic\/extracting-business-value-4-vs-big-data\n                    \n                  . Accessed 12 May 2018."},{"key":"204_CR9","doi-asserted-by":"publisher","unstructured":"Xie C, Gao J, Tao C. Big data validation case study. 2017 IEEE third international conference on big data computing service and applications (BigDataService). 2017; p. 281\u2013286. \n                    https:\/\/doi.org\/10.1109\/bigdataservice.2017.44\n                    \n                  .","DOI":"10.1109\/bigdataservice.2017.44"},{"key":"204_CR10","doi-asserted-by":"publisher","first-page":"940","DOI":"10.1016\/j.procs.2016.05.285","volume":"85","author":"N Garg","year":"2016","unstructured":"Garg N, Singla S, Jangra S. Challenges and techniques for testing of big data. Procedia Comput Sci. 2016;85:940\u20138. \n                    https:\/\/doi.org\/10.1016\/j.procs.2016.05.285\n                    \n                   International Conference on Computational Modelling and Security (CMS 2016).","journal-title":"Procedia Comput Sci"},{"key":"204_CR11","unstructured":"Redman TC. Data\u2019s credibility problem; 2013.\n                    https:\/\/enterprisersproject.com\/sites\/default\/files\/Data\u2019sCredibilityProblem.pdf\n                    \n                  ."},{"key":"204_CR12","doi-asserted-by":"publisher","unstructured":"Palazzo C, Mariello A, Fiore S, D\u2019Anca A, Elia D, Williams DN, Aloisio G. A workflow-enabled big data analytics software stack for eScience. In: 2015 International conference on high performance computing simulation (HPCS); 2015. p. 545\u2013552. \n                    https:\/\/doi.org\/10.1109\/HPCSim.2015.7237088\n                    \n                  .","DOI":"10.1109\/HPCSim.2015.7237088"},{"key":"204_CR13","doi-asserted-by":"publisher","unstructured":"Ordonez C, Garc\u00eda-Garc\u00eda J. Managing big data analytics workflows with a database system. In: 2016 16th IEEE\/ACM international symposium on cluster, cloud and grid computing (CCGrid); 2016. p. 649\u2013655. \n                    https:\/\/doi.org\/10.1109\/CCGrid.2016.63\n                    \n                  .","DOI":"10.1109\/CCGrid.2016.63"},{"issue":"1","key":"204_CR14","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1186\/s13173-017-0063-x","volume":"23","author":"N Laranjeiro","year":"2017","unstructured":"Laranjeiro N, Soydemir SN, Ivaki N, Bernardino J. Testing data-centric services using poor quality data: from relational to NoSQL document databases. J Braz Comput Soc. 2017;23(1):14. \n                    https:\/\/doi.org\/10.1186\/s13173-017-0063-x\n                    \n                  .","journal-title":"J Braz Comput Soc"},{"key":"204_CR15","unstructured":"Klarna: About us; 2018. \n                    https:\/\/www.klarna.com\/se\/om-oss\n                    \n                  . Accessed 29 Jan 2018."},{"issue":"1","key":"204_CR16","doi-asserted-by":"publisher","first-page":"6","DOI":"10.1007\/s41019-015-0004-7","volume":"1","author":"D Firmani","year":"2016","unstructured":"Firmani D, Mecella M, Scannapieco M, Batini C. On the meaningfulness of \u201cbig data quality\u201d (invited paper). Data Sci Eng. 2016;1(1):6\u201320. \n                    https:\/\/doi.org\/10.1007\/s41019-015-0004-7\n                    \n                  .","journal-title":"Data Sci Eng"},{"key":"204_CR17","doi-asserted-by":"publisher","first-page":"159","DOI":"10.1007\/978-3-319-98398-1_11","volume-title":"Advances in databases and information systems","author":"F Arolfo","year":"2018","unstructured":"Arolfo F, Vaisman A. Data quality in a big data context. In: Bencz\u00far A, Thalheim B, Horv\u00e1th T, editors. Advances in databases and information systems. Cham: Springer International Publishing; 2018. p. 159\u201372. \n                    https:\/\/doi.org\/10.1007\/978-3-319-98398-1_11\n                    \n                  ."},{"key":"204_CR18","unstructured":"RTTS: QuerySurge; 2018. \n                    http:\/\/www.querysurge.com\/solutions\/testing-big-data\n                    \n                  . Accessed 04 June 2018."},{"key":"204_CR19","unstructured":"Spotify: BigDiffy; 2018. \n                    https:\/\/github.com\/spotify\/ratatool\/tree\/master\/ratatool-diffy\n                    \n                  . Accessed 04 June 2018."},{"issue":"11","key":"204_CR20","doi-asserted-by":"publisher","first-page":"78","DOI":"10.14569\/IJACSA.2015.061111","volume":"6","author":"C Gyor\u00f6di","year":"2015","unstructured":"Gyor\u00f6di C, Gyor\u00f6di R, Sotoc R. A comparative study of relational and non-relational database models in a web- based application. Int J Adv Comput Sci Appl. 2015;6(11):78\u201383. \n                    https:\/\/doi.org\/10.14569\/IJACSA.2015.061111\n                    \n                  .","journal-title":"Int J Adv Comput Sci Appl"},{"issue":"1","key":"204_CR21","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s40537-014-0007-7","volume":"2","author":"MM Najafabadi","year":"2015","unstructured":"Najafabadi MM, Villanustre F, Khoshgoftaar TM, Seliya N, Wald R, Muharemagic E. Deep learning applications and challenges in big data analytics. J Big Data. 2015;2(1):1. \n                    https:\/\/doi.org\/10.1186\/s40537-014-0007-7\n                    \n                  .","journal-title":"J Big Data"},{"issue":"1","key":"204_CR22","doi-asserted-by":"publisher","first-page":"8","DOI":"10.1186\/s40537-017-0068-5","volume":"4","author":"NQ Mehmood","year":"2017","unstructured":"Mehmood NQ, Culmone R, Mostarda L. Modeling temporal aspects of sensor data for MongoDB NoSQL database. J Big Data. 2017;4(1):8. \n                    https:\/\/doi.org\/10.1186\/s40537-017-0068-5\n                    \n                  .","journal-title":"J Big Data"},{"key":"204_CR23","unstructured":"Geddam, S. Building a robust big data QA ecosystem to mitigate data integrity challenges; 2014. \n                    https:\/\/www.cognizant.com\/whitepapers\/building-a-robust-big-data-qa-ecosystem-to-mitigate-data-integrity-challenges-codex907.pdf\n                    \n                  . Accessed 12 May 2018."},{"key":"204_CR24","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1007\/978-3-319-24106-7","volume-title":"Data and information quality: dimensions, principles and techniques","author":"C Batini","year":"2016","unstructured":"Batini C, Monica S. Data and information quality: dimensions, principles and techniques. Switzerland: Springer; 2016. p. 21\u201351. \n                    https:\/\/doi.org\/10.1007\/978-3-319-24106-7\n                    \n                  ."},{"key":"204_CR25","unstructured":"Nagdive AS, Tugnayat DRM, Tembhurkar MP. Overview on performance testing approach in big data. International Journal of Advanced Research in Computer Science. 2014;5(8):165\u2013169. \n                    https:\/\/www.researchgate.net\/publication\/270338528_Overview_on_Performance_Testing_Approach_in_Big_Data\n                    \n                  . Accessed 12 May 2018."},{"key":"204_CR26","unstructured":"Try QA: Big data testing; 2017. \n                    http:\/\/tryqa.com\/big-data-testing\/\n                    \n                  . Accessed 13 Aug 2018."},{"key":"204_CR27","doi-asserted-by":"publisher","unstructured":"Yassien AW, Desouky AF. RDBMS, NoSQL, Hadoop: a performance-based empirical analysis. In: Proceedings of the 2nd Africa and Middle East conference on software engineering. AMECSE \u201916, vol. 28-29, p. 52\u201359. ACM, New York, NY, USA; 2016. \n                    https:\/\/doi.org\/10.1145\/2944165.2944174\n                    \n                  .","DOI":"10.1145\/2944165.2944174"},{"issue":"1","key":"204_CR28","doi-asserted-by":"publisher","first-page":"36","DOI":"10.1186\/s40537-018-0146-3","volume":"5","author":"M Birjali","year":"2018","unstructured":"Birjali M, Beni-Hssane A, Erritali M. Evaluation of high-level query languages based on MapReduce in big data. J Big Data. 2018;5(1):36. \n                    https:\/\/doi.org\/10.1186\/s40537-018-0146-3\n                    \n                  .","journal-title":"J Big Data"},{"key":"204_CR29","doi-asserted-by":"publisher","unstructured":"Thusoo A, Sarma JS, Jain N, Shao Z, Chakka P, Zhang N, Anthony S, Liu H, Murthy R. Hive\u2014a petabyte scale data warehouse using Hadoop. In: Proceedings of the 26th international conference on data engineering, ICDE 2010, p. 996\u20131005. IEEE, Long Beach, California, USA; 2010. \n                    https:\/\/doi.org\/10.1109\/ICDE.2010.5447738\n                    \n                  .","DOI":"10.1109\/ICDE.2010.5447738"},{"key":"204_CR30","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4842-0271-5","volume-title":"Practical Hive: a guide to Hadoop\u2019s data warehouse system","author":"S Shaw","year":"2016","unstructured":"Shaw S, Vermeulen AF, Gupta A, Kjerrumgaard D. Practical Hive: a guide to Hadoop\u2019s data warehouse system. 1st ed. Berkely, CA, USA: Apress; 2016.","edition":"1"},{"key":"204_CR31","unstructured":"Atlassian: Apache Hive configuration properties; 2018. \n                    https:\/\/cwiki.apache.org\/confluence\/display\/Hive\/Configuration+Properties\n                    \n                  . Accessed 15 Apr 2018."},{"key":"204_CR32","unstructured":"Atlassian: Apache Hive admin manual configuration; 2017. \n                    https:\/\/cwiki.apache.org\/confluence\/display\/Hive\/AdminManual+Configuration\n                    \n                  . Accessed 07 May 2018."},{"issue":"3","key":"204_CR33","doi-asserted-by":"publisher","first-page":"88","DOI":"10.1111\/test.12133","volume":"39","author":"J \u017derovnik","year":"2017","unstructured":"\u017derovnik J, Poklukar DR. Elementary methods for computation of quartiles. Teach Stat. 2017;39(3):88\u201391. \n                    https:\/\/doi.org\/10.1111\/test.12133\n                    \n                  .","journal-title":"Teach Stat"},{"key":"204_CR34","unstructured":"Atlassian: Apache Hive language manual; 2017. \n                    https:\/\/cwiki.apache.org\/confluence\/display\/Hive\/LanguageManual\n                    \n                  . Accessed 08 Apr 2018."},{"key":"204_CR35","doi-asserted-by":"publisher","DOI":"10.3390\/inventions3040071","author":"V Moulos","year":"2018","unstructured":"Moulos V, Chatzikyriakos G, Kassouras V, Doulamis A, Doulamis N, Leventakis G, Florakis T, Varvarigou T, Mitsokapas E, Kioumourtzis G, Klirodetis P, Psychas A, Marinakis A, Sfetsos T, Koniaris A, Liapis D, Gatzioura A. A robust information life cycle management framework for securing and governing critical infrastructure systems. Inventions. 2018;. \n                    https:\/\/doi.org\/10.3390\/inventions3040071\n                    \n                  .","journal-title":"Inventions"},{"key":"204_CR36","doi-asserted-by":"publisher","first-page":"70","DOI":"10.1007\/978-3-319-92898-2_6","volume-title":"Advanced information systems engineering workshops","author":"J Schneider","year":"2018","unstructured":"Schneider J, Handali JP, vom Brocke J. Increasing trust in (big) data analytics. In: Matulevi\u010dius R, Dijkman R, editors. Advanced information systems engineering workshops. Cham: Springer; 2018. p. 70\u201384."},{"key":"204_CR37","doi-asserted-by":"publisher","unstructured":"S\u00e4nger J, Richthammer C, Hassan S, Pernul, G. Trust and big data: a roadmap for research. In: 2014 25th international workshop on database and expert systems applications; 2014. p. 278\u2013282. \n                    https:\/\/doi.org\/10.1109\/DEXA.2014.63\n                    \n                  .","DOI":"10.1109\/DEXA.2014.63"},{"key":"204_CR38","unstructured":"TPC: TPC Benchmark$$^{{\\rm TM}}$$ H standard specification revision 2.17.3, San Francisco. Transaction processing performance council (TPC); 1993\u20132017. \n                    http:\/\/www.tpc.org\/tpc_documents_current_versions\/pdf\/tpc-h_v2.17.3.pdf\n                    \n                  ."},{"key":"204_CR39","unstructured":"Hortonworks: Hive TestBench; 2018. \n                    https:\/\/github.com\/hortonworks\/hive-testbench\n                    \n                  . Accessed 16 May 2018."},{"issue":"11","key":"204_CR40","doi-asserted-by":"publisher","first-page":"864","DOI":"10.14778\/2983200.2983203","volume":"9","author":"X Chu","year":"2016","unstructured":"Chu X, Ilyas IF, Koutris P. Distributed data deduplication. Proc VLDB Endowment. 2016;9(11):864\u201375. \n                    https:\/\/doi.org\/10.14778\/2983200.2983203\n                    \n                  .","journal-title":"Proc VLDB Endowment"}],"container-title":["Journal of Big Data"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1186\/s40537-019-0204-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1186\/s40537-019-0204-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1186\/s40537-019-0204-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,5,16]],"date-time":"2020-05-16T23:04:29Z","timestamp":1589670269000},"score":1,"resource":{"primary":{"URL":"https:\/\/journalofbigdata.springeropen.com\/articles\/10.1186\/s40537-019-0204-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,5,18]]},"references-count":40,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2019,12]]}},"alternative-id":["204"],"URL":"https:\/\/doi.org\/10.1186\/s40537-019-0204-5","relation":{},"ISSN":["2196-1115"],"issn-type":[{"type":"electronic","value":"2196-1115"}],"subject":[],"published":{"date-parts":[[2019,5,18]]},"assertion":[{"value":"16 February 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 May 2019","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 May 2019","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The authors declare that they have no competing interests.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"41"}}