{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,30]],"date-time":"2026-01-30T00:05:54Z","timestamp":1769731554782,"version":"3.49.0"},"reference-count":47,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2017,8,7]],"date-time":"2017-08-07T00:00:00Z","timestamp":1502064000000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2017,8,7]],"date-time":"2017-08-07T00:00:00Z","timestamp":1502064000000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"name":"National Science Foundation","award":["IIS-1302698"],"award-info":[{"award-number":["IIS-1302698"]}]},{"name":"National Science Foundation","award":["CNS-1351047"],"award-info":[{"award-number":["CNS-1351047"]}]},{"DOI":"10.13039\/100000070","name":"National Institute of Biomedical Imaging and Bioengineering","doi-asserted-by":"publisher","award":["U54EB020404"],"award-info":[{"award-number":["U54EB020404"]}],"id":[{"id":"10.13039\/100000070","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["The VLDB Journal"],"published-print":{"date-parts":[[2018,10]]},"DOI":"10.1007\/s00778-017-0474-5","type":"journal-article","created":{"date-parts":[[2017,8,7]],"date-time":"2017-08-07T07:37:05Z","timestamp":1502091425000},"page":"595-615","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":24,"title":["Adding data provenance support to Apache Spark"],"prefix":"10.1007","volume":"27","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5756-8321","authenticated-orcid":false,"given":"Matteo","family":"Interlandi","sequence":"first","affiliation":[]},{"given":"Ari","family":"Ekmekji","sequence":"additional","affiliation":[]},{"given":"Kshitij","family":"Shah","sequence":"additional","affiliation":[]},{"given":"Muhammad Ali","family":"Gulzar","sequence":"additional","affiliation":[]},{"given":"Sai Deep","family":"Tetali","sequence":"additional","affiliation":[]},{"given":"Miryung","family":"Kim","sequence":"additional","affiliation":[]},{"given":"Todd","family":"Millstein","sequence":"additional","affiliation":[]},{"given":"Tyson","family":"Condie","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2017,8,7]]},"reference":[{"key":"474_CR1","doi-asserted-by":"crossref","unstructured":"Alvaro, P., Rosen, J., Hellerstein, J.M.: Lineage-driven fault injection. In: SIGMOD, pp. 331\u2013346 (2015)","DOI":"10.1145\/2723372.2723711"},{"issue":"4","key":"474_CR2","first-page":"346","volume":"5","author":"Y Amsterdamer","year":"2011","unstructured":"Amsterdamer, Y., Davidson, S.B., Deutch, D., Milo, T., Stoyanovich, J., Tannen, V.: Putting lipstick on pig: enabling database-style workflow provenance. VLDB 5(4), 346\u2013357 (2011)","journal-title":"VLDB"},{"key":"474_CR3","doi-asserted-by":"crossref","unstructured":"Anand, M.K., Bowers, S., Lud\u00e4scher, B.: Techniques for efficiently querying scientific workflow provenance graphs. In: EDBT, pp. 287\u2013298 (2010)","DOI":"10.1145\/1739041.1739078"},{"key":"474_CR4","doi-asserted-by":"crossref","unstructured":"Armbrust, M., Xin, R.S., Lian, C., Huai, Y., Liu, D., Bradley, J.K., Meng, X., Kaftan, T., Franklin, M.J., Ghodsi, A., Zaharia, M.: Spark SQL: relational data processing in spark. In: SIGMOD, pp. 1383\u20131394 (2015)","DOI":"10.1145\/2723372.2742797"},{"key":"474_CR5","unstructured":"Asterixdb. https:\/\/asterixdb.apache.org\/"},{"key":"474_CR6","unstructured":"Bigdebug. sites.google.com\/site\/sparkbigdebug\/"},{"key":"474_CR7","doi-asserted-by":"crossref","unstructured":"Biton, O., Cohen-Boulakia, S., Davidson, S.B., Hara, C.S.: Querying and managing provenance through user views in scientific workflows. In: ICDE, pp. 1072\u20131081 (2008)","DOI":"10.1109\/ICDE.2008.4497516"},{"key":"474_CR8","doi-asserted-by":"crossref","unstructured":"Borkar, V., Carey, M., Grover, R., Onose, N., Vernica, R.: Hyracks: a flexible and extensible foundation for data-intensive computing. In: ICDE, pp. 1151\u20131162 (2011)","DOI":"10.1109\/ICDE.2011.5767921"},{"issue":"5","key":"474_CR9","doi-asserted-by":"publisher","first-page":"709","DOI":"10.1002\/spe.2325","volume":"46","author":"S Chambi","year":"2016","unstructured":"Chambi, S., Lemire, D., Kaser, O., Godin, R.: Better bitmap performance with roaring bitmaps. Softw. Pract. Exp. 46(5), 709\u2013719 (2016)","journal-title":"Softw. Pract. Exp."},{"issue":"12","key":"474_CR10","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.14778\/2994509.2994530","volume":"9","author":"Z Chothia","year":"2016","unstructured":"Chothia, Z., Liagouris, J., McSherry, F., Roscoe, T.: Explaining outputs in modern data analytics. Proc. VLDB Endow. 9(12), 1137\u20131148 (2016)","journal-title":"Proc. VLDB Endow."},{"issue":"1","key":"474_CR11","doi-asserted-by":"publisher","first-page":"41","DOI":"10.1007\/s00778-002-0083-8","volume":"12","author":"Y Cui","year":"2003","unstructured":"Cui, Y., Widom, J.: Lineage tracing for general data warehouse transformations. VLDBJ 12(1), 41\u201358 (2003)","journal-title":"VLDBJ"},{"key":"474_CR12","unstructured":"Dave, A., Zaharia, M., Shenker, S., Stoica, I.: Arthur: Rich post-facto debugging for production analytics applications. Tech. Rep. (2013)"},{"key":"474_CR13","unstructured":"Flink. https:\/\/flink.apache.org\/"},{"key":"474_CR14","doi-asserted-by":"crossref","unstructured":"Glavic, B., Alonso, G.: Perm: Processing provenance and data on the same data model through query rewriting. In: ICDE, pp. 174\u2013185 (2009)","DOI":"10.1109\/ICDE.2009.15"},{"issue":"1","key":"474_CR15","first-page":"1314","volume":"3","author":"B Glavic","year":"2010","unstructured":"Glavic, B., Alonso, G., Miller, R.J., Haas, L.M.: TRAMP: understanding the behavior of schema mappings through provenance. PVLDB 3(1), 1314\u20131325 (2010)","journal-title":"PVLDB"},{"key":"474_CR16","unstructured":"Gonzalez, J.E., Xin, R.S., Dave, A., Crankshaw, D., Franklin, M.J., Stoica, I.: Graphx: graph processing in a distributed dataflow framework. In: OSDI, pp. 599\u2013613 (2014)"},{"key":"474_CR17","doi-asserted-by":"crossref","unstructured":"Graefe, G., McKenna, W.J.: The volcano optimizer generator: extensibility and efficient search. In: ICDE, pp. 209\u2013218 (1993)","DOI":"10.1109\/ICDE.1993.344061"},{"key":"474_CR18","unstructured":"Green, T.J., Karvounarakis, G., Ives, Z.G., Tannen, V.: Update exchange with mappings and provenance. In: Proceedings of the 33rd International Conference on Very Large Data Bases, VLDB \u201907, pp. 675\u2013686. VLDB Endowment (2007)"},{"key":"474_CR19","unstructured":"Gulzar, M.A., Han, X., Interlandi, M., Mardani, S., Tetali, S.D., Millstein, T., Kim, M.: Interactive debugging for big data analytics. In: 8th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 16). USENIX Association, Denver, CO (2016)"},{"key":"474_CR20","doi-asserted-by":"crossref","unstructured":"Gulzar, M.A., Han, M.I.X., Li, M., Condie, T., Kim, M.: Automated debugging in data-intensive scalable computing. In: Proceedings of the Seventh ACM Symposium on Cloud Computing, SoCC \u201917. ACM, New York (2017)","DOI":"10.1145\/3127479.3131624"},{"key":"474_CR21","doi-asserted-by":"crossref","unstructured":"Gulzar, M.A., Interlandi, M., Condie, T., Kim, M.: Bigdebug: interactive debugger for big data analytics in apache spark. In: FSE, pp. 1033\u20131037 (2016)","DOI":"10.1145\/2950290.2983930"},{"key":"474_CR22","doi-asserted-by":"crossref","unstructured":"Gulzar, M.A., Interlandi, M., Yoo, S., Tetali, S.D., Condie, T., Millstein, T., Kim, M.: Bigdebug: debugging primitives for interactive big data processing in spark. In: ICSE, pp. 784\u2013795 (2016)","DOI":"10.1145\/2884781.2884813"},{"key":"474_CR23","unstructured":"Hadoop. http:\/\/hadoop.apache.org"},{"key":"474_CR24","doi-asserted-by":"crossref","unstructured":"Heinis, T., Alonso, G.: Efficient lineage tracking for scientific workflows. In: SIGMOD, pp. 1007\u20131018 (2008)","DOI":"10.1145\/1376616.1376716"},{"key":"474_CR25","unstructured":"Ikeda, R., Park, H., Widom, J.: Provenance for generalized map and reduce workflows. In: CIDR, pp. 273\u2013283 (2011)"},{"key":"474_CR26","doi-asserted-by":"crossref","unstructured":"Interlandi, M., Tang, N.: Proof positive and negative in data cleaning. In: ICDE, pp. 18\u201329 (2015)","DOI":"10.1109\/ICDE.2015.7113269"},{"key":"474_CR27","doi-asserted-by":"crossref","unstructured":"Interlandi, M., Tetali, S.D., Gulzar, M.A., Noor, J., Condie, T., Kim, M., Millstein, T.: Optimizing interactive development of data-intensive applications. In: Proceedings of the Seventh ACM Symposium on Cloud Computing, SoCC \u201916, pp. 510\u2013522. ACM, New York, NY, USA (2016)","DOI":"10.1145\/2987550.2987565"},{"issue":"3","key":"474_CR28","first-page":"216","volume":"9","author":"M Interlandi","year":"2015","unstructured":"Interlandi, M., Shah, K., Tetali, S.D., Gulzar, M.A., Yoo, S., Kim, M., Millstein, T.D., Condie, T.: Titian: data provenance support in spark. PVLDB 9(3), 216\u2013227 (2015)","journal-title":"PVLDB"},{"key":"474_CR29","doi-asserted-by":"crossref","unstructured":"Karvounarakis, G., Ives, Z.G., Tannen, V.: Querying data provenance. In: Proceedings of the 2010 ACM SIGMOD International Conference on Management of Data, SIGMOD \u201910, pp. 951\u2013962. ACM, New York, NY, USA (2010)","DOI":"10.1145\/1807167.1807269"},{"key":"474_CR30","doi-asserted-by":"crossref","unstructured":"Karvounarakis, G., Ives, Z.G., Tannen, V.: Querying data provenance. In: SIGMOD, pp. 951\u2013962 (2010)","DOI":"10.1145\/1807167.1807269"},{"key":"474_CR31","doi-asserted-by":"crossref","unstructured":"Logothetis, D., De, S., Yocum, K.: Scalable lineage capture for debugging disc analytics. In: SOCC, pp. 17:1\u201317:15 (2013)","DOI":"10.1145\/2523616.2523619"},{"issue":"1","key":"474_CR32","first-page":"34","volume":"4","author":"A Meliou","year":"2010","unstructured":"Meliou, A., Gatterbauer, W., Moore, K.F., Suciu, D.: The complexity of causality and responsibility for query answers and non-answers. PVLDB 4(1), 34\u201345 (2010)","journal-title":"PVLDB"},{"key":"474_CR33","first-page":"17","volume-title":"Lecture Notes in Computer Science","author":"Paolo Missier","year":"2008","unstructured":"Missier, P., Belhajjame, K., Zhao, J., Roos, M., Goble, C.A.: Data lineage model for Taverna workflows with lightweight annotation requirements. In: IPAW, pp. 17\u201330 (2008)"},{"key":"474_CR34","unstructured":"Mllib. http:\/\/spark.apache.org\/mllib"},{"key":"474_CR35","doi-asserted-by":"crossref","unstructured":"Murray, D.G., McSherry, F., Isaacs, R., Isard, M., Barham, P., Abadi, M.: Naiad: a timely dataflow system. In: SOSP. ACM (2013)","DOI":"10.1145\/2517349.2522738"},{"key":"474_CR36","doi-asserted-by":"crossref","unstructured":"Olston, C., Reed, B., Srivastava, U., Kumar, R., Tomkins, A.: Pig latin: a not-so-foreign language for data processing. In: SIGMOD, pp. 1099\u20131110. ACM (2008)","DOI":"10.1145\/1376616.1376726"},{"issue":"12","key":"474_CR37","first-page":"1237","volume":"4","author":"C Olston","year":"2011","unstructured":"Olston, C., Reed, B.: Inspector gadget: a framework for custom monitoring and debugging of distributed dataflows. PVLDB 4(12), 1237\u20131248 (2011)","journal-title":"PVLDB"},{"key":"474_CR38","doi-asserted-by":"crossref","unstructured":"Roy, S., Suciu, D.: A formal approach to finding explanations for database queries. In: SIGMOD, pp. 1579\u20131590 (2014)","DOI":"10.1145\/2588555.2588578"},{"key":"474_CR39","unstructured":"Spark. http:\/\/spark.apache.org"},{"issue":"2","key":"474_CR40","first-page":"1626","volume":"2","author":"A Thusoo","year":"2009","unstructured":"Thusoo, A., Sarma, J.S., Jain, N., Shao, Z., Chakka, P., Anthony, S., Liu, H., Wyckoff, P., Murthy, R.: Hive: a warehousing solution over a map-reduce framework. VLDB 2(2), 1626\u20131629 (2009)","journal-title":"VLDB"},{"key":"474_CR41","doi-asserted-by":"crossref","unstructured":"Wang, L., Zhan, J., Luo, C., Zhu, Y., Yang, Q., He, Y., Gao, W., Jia, Z., Shi, Y., Zhang, S., Zheng, C., Lu, G., Zhan, K., Li, X., Qiu, B.: Bigdatabench: a big data benchmark suite from internet services. In HPCA, pp. 488\u2013499 (2014)","DOI":"10.1109\/HPCA.2014.6835958"},{"key":"474_CR42","doi-asserted-by":"crossref","unstructured":"Welsh, M., Culler, D., Brewer, E.: Seda: an architecture for well-conditioned, scalable internet services. In: SOSP, pp. 230\u2013243 (2001)","DOI":"10.1145\/502059.502057"},{"issue":"8","key":"474_CR43","doi-asserted-by":"publisher","first-page":"553","DOI":"10.14778\/2536354.2536356","volume":"6","author":"E Wu","year":"2013","unstructured":"Wu, E., Madden, S.: Scorpion: explaining away outliers in aggregate queries. Proc. VLDB Endow. 6(8), 553\u2013564 (2013)","journal-title":"Proc. VLDB Endow."},{"key":"474_CR44","unstructured":"Zaharia, M., Chowdhury, M., Das, T., Dave, A., Ma, J., McCauley, M., Franklin, M.J., Shenker, S., Stoica, I.: Resilient distributed datasets: a fault-tolerant abstraction for in-memory cluster computing. In: NSDI (2012)"},{"issue":"2","key":"474_CR45","first-page":"183","volume":"28","author":"A Zeller","year":"2002","unstructured":"Zeller, A., Hildebrandt, R.: Simplifying and isolating failure-inducing input. TSE 28(2), 183\u2013200 (2002)","journal-title":"TSE"},{"key":"474_CR46","doi-asserted-by":"crossref","unstructured":"Zhou, W., Fei, Q., Narayan, A., Haeberlen, A., Loo, B.T., Sherr, M.: Secure network provenance. In: SOSP, pp. 295\u2013310 (2011)","DOI":"10.1145\/2043556.2043584"},{"key":"474_CR47","doi-asserted-by":"crossref","unstructured":"Zhou, W., Sherr, M., Tao, T., Li, X., Loo, B.T., Mao, Y.: Efficient querying and maintenance of network provenance at internet-scale. In: SIGMOD, pp. 615\u2013626 (2010)","DOI":"10.1145\/1807167.1807234"}],"container-title":["The VLDB Journal"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s00778-017-0474-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00778-017-0474-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00778-017-0474-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,24]],"date-time":"2025-06-24T20:49:10Z","timestamp":1750798150000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s00778-017-0474-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,8,7]]},"references-count":47,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2018,10]]}},"alternative-id":["474"],"URL":"https:\/\/doi.org\/10.1007\/s00778-017-0474-5","relation":{},"ISSN":["1066-8888","0949-877X"],"issn-type":[{"value":"1066-8888","type":"print"},{"value":"0949-877X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2017,8,7]]},"assertion":[{"value":"15 January 2017","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 May 2017","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 July 2017","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 August 2017","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}