{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T09:57:16Z","timestamp":1773482236644,"version":"3.50.1"},"reference-count":82,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,12,2]],"date-time":"2025-12-02T00:00:00Z","timestamp":1764633600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,12,2]],"date-time":"2025-12-02T00:00:00Z","timestamp":1764633600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Key Basic Research Foundation of Shenzhen under Grant","award":["805 JCYJ20220818100205012"],"award-info":[{"award-number":["805 JCYJ20220818100205012"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Data Sci Anal"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1007\/s41060-025-00916-7","type":"journal-article","created":{"date-parts":[[2025,12,2]],"date-time":"2025-12-02T07:27:05Z","timestamp":1764660425000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["A survey of approximate big data computing with the random sample partition (RSP)"],"prefix":"10.1007","volume":"21","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6750-003X","authenticated-orcid":false,"given":"Salman","family":"Salloum","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9315-4057","authenticated-orcid":false,"given":"Kian-Lee","family":"Tan","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6797-2571","authenticated-orcid":false,"given":"Joshua Zhexue","family":"Huang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,12,2]]},"reference":[{"key":"916_CR1","doi-asserted-by":"publisher","unstructured":"Ma, S., Huai, J.: Approximate computation for big data analytics. SIGWEB Newsl. 2021(Winter) (2021) https:\/\/doi.org\/10.1145\/3447879.3447883","DOI":"10.1145\/3447879.3447883"},{"issue":"11","key":"916_CR2","doi-asserted-by":"publisher","first-page":"1583","DOI":"10.14778\/3342263.3342635","volume":"12","author":"AB Siddique","year":"2019","unstructured":"Siddique, A.B., Eldawy, A., Hristidis, V.: Comparing synopsis techniques for approximate spatial data analysis. Proc. VLDB Endow. 12(11), 1583\u20131596 (2019). https:\/\/doi.org\/10.14778\/3342263.3342635","journal-title":"Proc. VLDB Endow."},{"issue":"4","key":"916_CR3","doi-asserted-by":"publisher","first-page":"379","DOI":"10.1007\/s41019-018-0074-4","volume":"3","author":"K Li","year":"2018","unstructured":"Li, K., Li, G.: Approximate query processing: what is new and where to go? Data Sci. Eng. 3(4), 379\u2013397 (2018). https:\/\/doi.org\/10.1007\/s41019-018-0074-4","journal-title":"Data Sci. Eng."},{"key":"916_CR4","unstructured":"Hu, G., Zhang, D., Rigo, S., Nguyen, T.D.: Approximation with error bounds in spark. CoRR abs\/1812.01823 (2018) arXiv:1812.01823"},{"key":"916_CR5","doi-asserted-by":"publisher","unstructured":"Goiri, I., Bianchini, R., Nagarakatte, S., Nguyen, T.D.: Approxhadoop: Bringing approximations to mapreduce frameworks. In: Proceedings of the Twentieth International Conference on Architectural Support for Programming Languages and Operating Systems. ASPLOS \u201915, pp. 383\u2013397 (2015). https:\/\/doi.org\/10.1145\/2694344.2694351","DOI":"10.1145\/2694344.2694351"},{"issue":"2","key":"916_CR6","doi-asserted-by":"publisher","first-page":"85","DOI":"10.26599\/BDMA.2019.9020015","volume":"3","author":"MS Mahmud","year":"2020","unstructured":"Mahmud, M.S., Huang, J.Z., Salloum, S., Emara, T.Z., Sadatdiynov, K.: A survey of data partitioning and sampling methods to support big data analysis. Big Data Min. Anal. 3(2), 85\u2013101 (2020)","journal-title":"Big Data Min. Anal."},{"key":"916_CR7","doi-asserted-by":"publisher","unstructured":"Salloum, S., Huang, J.Z., He, Y.: Random sample partition: A distributed data model for big data analysis. IEEE Trans. Ind. Inform., 1\u20131 (2019) https:\/\/doi.org\/10.1109\/TII.2019.2912723","DOI":"10.1109\/TII.2019.2912723"},{"issue":"2","key":"916_CR8","doi-asserted-by":"publisher","first-page":"154","DOI":"10.26599\/BDMA.2022.9020014","volume":"6","author":"X Sun","year":"2023","unstructured":"Sun, X., He, Y., Wu, D., Huang, J.Z.: Survey of distributed computing frameworks for supporting big data analysis. Big Data Min. Anal. 6(2), 154\u2013169 (2023). https:\/\/doi.org\/10.26599\/BDMA.2022.9020014","journal-title":"Big Data Min. Anal."},{"key":"916_CR9","doi-asserted-by":"crossref","unstructured":"Wu, X., Liu, X., Dai, S.: The reliability of big data. In: 2014 IEEE 7th Joint International Information Technology and Artificial Intelligence Conference, pp. 295\u2013299 (2014)","DOI":"10.1109\/ITAIC.2014.7065054"},{"key":"916_CR10","doi-asserted-by":"crossref","unstructured":"Papakonstantinou, P.A., Woodruff, D.P., Yang, G.: True randomness from big data. Sci. Rep.6 (2016)","DOI":"10.1038\/srep33740"},{"key":"916_CR11","doi-asserted-by":"publisher","first-page":"28","DOI":"10.1016\/j.bdr.2017.07.003","volume":"9","author":"R Genuer","year":"2017","unstructured":"Genuer, R., Poggi, J.-M., Tuleau-Malot, C., Villa-Vialaneix, N.: Random forests for big data. Big Data Res. 9, 28\u201346 (2017). https:\/\/doi.org\/10.1016\/j.bdr.2017.07.003","journal-title":"Big Data Res."},{"key":"916_CR12","doi-asserted-by":"publisher","first-page":"3675","DOI":"10.1109\/ACCESS.2018.2889355","volume":"7","author":"S Salloum","year":"2019","unstructured":"Salloum, S., Huang, J.Z., He, Y., Chen, X.: An asymptotic ensemble learning framework for big data analysis. IEEE Access 7, 3675\u20133693 (2019). https:\/\/doi.org\/10.1109\/ACCESS.2018.2889355","journal-title":"IEEE Access"},{"key":"916_CR13","doi-asserted-by":"publisher","unstructured":"Salloum, S., Huang, J.Z., He, Y.: Empirical analysis of asymptotic ensemble learning for big data. In: Proceedings of the 3rd IEEE\/ACM International Conference on Big Data Computing, Applications and Technologies. BDCAT \u201916, pp. 8\u201317. Association for Computing Machinery, New York, NY, USA (2016). https:\/\/doi.org\/10.1145\/3006299.3006306","DOI":"10.1145\/3006299.3006306"},{"key":"916_CR14","doi-asserted-by":"publisher","unstructured":"Li, M.J., Cai, W., Lin, Y., Huang, S., Huang, J.Z., Peng, P.X.: RSP-gcForest: A Distributed Deep Forest via Random Sample Partition. In: Proceedings - 2023 IEEE International Conference on Big Data, BigData 2023, 710\u2013717 (2023) https:\/\/doi.org\/10.1109\/BIGDATA59044.2023.10386844","DOI":"10.1109\/BIGDATA59044.2023.10386844"},{"key":"916_CR15","doi-asserted-by":"publisher","DOI":"10.1016\/J.PATCOG.2024.111321","volume":"161","author":"MS Mahmud","year":"2025","unstructured":"Mahmud, M.S., Zheng, H., Garcia-Gil, D., Garc\u00eda, S., Huang, J.Z.: RSPCA: random sample partition and clustering approximation for ensemble learning of big data. Pattern Recognit. 161, 111321 (2025). https:\/\/doi.org\/10.1016\/J.PATCOG.2024.111321","journal-title":"Pattern Recognit."},{"key":"916_CR16","doi-asserted-by":"publisher","unstructured":"Du, X., He, Y., Huang, J.Z.: Random Sample Partition-Based Clustering Ensemble Algorithm for Big Data. In: Proceedings - 2021 IEEE International Conference on Big Data, Big Data 2021, 5885\u20135887 (2021) https:\/\/doi.org\/10.1109\/BIGDATA52589.2021.9671297","DOI":"10.1109\/BIGDATA52589.2021.9671297"},{"issue":"1","key":"916_CR17","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/S40537-023-00709-4\/TABLES\/7","volume":"10","author":"MS Mahmud","year":"2023","unstructured":"Mahmud, M.S., Huang, J.Z., Ruby, R., Wu, K.: An ensemble method for estimating the number of clusters in a big data set using multiple random samples. J. Big Data 10(1), 1\u201333 (2023). https:\/\/doi.org\/10.1186\/S40537-023-00709-4\/TABLES\/7","journal-title":"J. Big Data"},{"key":"916_CR18","doi-asserted-by":"publisher","DOI":"10.1016\/J.INFFUS.2023.101986","volume":"101","author":"MS Mahmud","year":"2024","unstructured":"Mahmud, M.S., Huang, J.Z., Garc\u00eda, S.: Clustering approximation via a fusion of multiple random samples. Inform. Fus. 101, 101986 (2024). https:\/\/doi.org\/10.1016\/J.INFFUS.2023.101986","journal-title":"Inform. Fus."},{"key":"916_CR19","doi-asserted-by":"publisher","DOI":"10.1016\/J.INS.2024.121314","volume":"686","author":"Y Cai","year":"2025","unstructured":"Cai, Y., Mahmud, M.S., Xu, J., Sun, X., Huang, J.Z.: Spectral ensemble clustering with doubly stochastic co-association matrix. Inform. Sci. 686, 121314 (2025). https:\/\/doi.org\/10.1016\/J.INS.2024.121314","journal-title":"Inform. Sci."},{"key":"916_CR20","doi-asserted-by":"publisher","unstructured":"Pan, X., Deng, J., Yang, H., Peng, J., Yin, J.: Dpspc: A density peak-based statistical parallel clustering algorithm for big data. In: Knowledge Science, Engineering and Management: 17th International Conference, KSEM 2024, Birmingham, UK, August 16 18, 2024, Proceedings, Part II, pp. 292\u2013304. Springer, Berlin, Heidelberg (2024). https:\/\/doi.org\/10.1007\/978-981-97-5495-3_22","DOI":"10.1007\/978-981-97-5495-3_22"},{"key":"916_CR21","first-page":"15","volume":"00","author":"T Valiullin","year":"2020","unstructured":"Valiullin, T., Huang, Z.J., Wei, C., Yin, J., Wu, D., Egorova, L.: A new approximate method for mining frequent itemsets from big data. Comput. Sci. Inf. Syst. 00, 15\u201315 (2020)","journal-title":"Comput. Sci. Inf. Syst."},{"issue":"2","key":"916_CR22","doi-asserted-by":"publisher","DOI":"10.1016\/J.IPM.2023.103577","volume":"61","author":"X Sun","year":"2024","unstructured":"Sun, X., Ngueilbaye, A., Luo, K., Cai, Y., Wu, D., Huang, J.Z.: A scalable and flexible basket analysis system for big transaction data in Spark. Inform. Process. Manage. 61(2), 103577 (2024). https:\/\/doi.org\/10.1016\/J.IPM.2023.103577","journal-title":"Inform. Process. Manage."},{"issue":"1","key":"916_CR23","doi-asserted-by":"publisher","first-page":"45","DOI":"10.1186\/s40537-019-0205-4","volume":"6","author":"S Salloum","year":"2019","unstructured":"Salloum, S., Huang, J.Z., He, Y.: Exploring and cleaning big data with random sample data blocks. J. Big Data 6(1), 45 (2019)","journal-title":"J. Big Data"},{"issue":"1","key":"916_CR24","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1145\/2688072","volume":"58","author":"R Nair","year":"2014","unstructured":"Nair, R.: Big data needs approximate computing: technical perspective. Commun. ACM 58(1), 104\u2013104 (2014). https:\/\/doi.org\/10.1145\/2688072","journal-title":"Commun. ACM"},{"issue":"1","key":"916_CR25","doi-asserted-by":"publisher","first-page":"100","DOI":"10.1109\/MCG.2017.6","volume":"37","author":"BC Kwon","year":"2017","unstructured":"Kwon, B.C., Verma, J., Haas, P.J., Demiralp, C.: Sampling for scalable visual analytics. IEEE Comput. Graphics Appl. 37(1), 100\u2013108 (2017). https:\/\/doi.org\/10.1109\/MCG.2017.6","journal-title":"IEEE Comput. Graphics Appl."},{"key":"916_CR26","doi-asserted-by":"crossref","unstructured":"Riondato, M.: Sampling-based data mining algorithms: Modern techniques and case studies. In: Proceedings of the 2014th European Conference on Machine Learning and Knowledge Discovery in Databases - Volume Part III. ECMLPKDD\u201914, pp. 516\u2013519. Springer, Berlin, Heidelberg (2014)","DOI":"10.1007\/978-3-662-44845-8_48"},{"issue":"3","key":"916_CR27","first-page":"59","volume":"38","author":"S Krishnan","year":"2015","unstructured":"Krishnan, S., Wang, J., Franklin, M.J., Goldberg, K., Kraska, T., Milo, T., Wu, E.: Sampleclean: fast and reliable analytics on dirty data. IEEE Data Eng. Bull. 38(3), 59\u201375 (2015)","journal-title":"IEEE Data Eng. Bull."},{"key":"916_CR28","doi-asserted-by":"publisher","unstructured":"Sutton, C.A., Hobson, T., Geddes, J., Caruana, R.: Data diff: Interpretable, executable summaries of changes in distributions for data wrangling. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, KDD 2018, London, UK, August 19-23, 2018, pp. 2279\u20132288 (2018). https:\/\/doi.org\/10.1145\/3219819.3220057","DOI":"10.1145\/3219819.3220057"},{"key":"916_CR29","doi-asserted-by":"publisher","unstructured":"Chu, X., Ilyas, I.F., Krishnan, S., Wang, J.: Data cleaning: Overview and emerging challenges. In: Proceedings of the 2016 International Conference on Management of Data. SIGMOD \u201916, pp. 2201\u20132206. ACM, New York, NY, USA (2016). https:\/\/doi.org\/10.1145\/2882903.2912574","DOI":"10.1145\/2882903.2912574"},{"issue":"12","key":"916_CR30","doi-asserted-by":"publisher","first-page":"948","DOI":"10.14778\/2994509.2994514","volume":"9","author":"S Krishnan","year":"2016","unstructured":"Krishnan, S., Wang, J., Wu, E., Franklin, M.J., Goldberg, K.: Activeclean: interactive data cleaning for statistical modeling. Proc. VLDB Endow. 9(12), 948\u2013959 (2016). https:\/\/doi.org\/10.14778\/2994509.2994514","journal-title":"Proc. VLDB Endow."},{"key":"916_CR31","doi-asserted-by":"publisher","unstructured":"Sun, X., Zhao, L., Chen, J., Cai, Y., Wu, D., Huang, J.Z.: Non-MapReduce computing for intelligent big data analysis. Eng. Appl. Artif. Intell. 129 (2024) https:\/\/doi.org\/10.1016\/j.engappai.2023.107648","DOI":"10.1016\/j.engappai.2023.107648"},{"key":"916_CR32","doi-asserted-by":"publisher","unstructured":"Sun, X., Wu, D., Cai, Y., Zhao, L., Xiao, C., Huang, J.Z.: Mapreduce vs non-mapreduce - efficiency and scalability in big data computing (2023) https:\/\/doi.org\/10.18699\/sblai2023-42","DOI":"10.18699\/sblai2023-42"},{"key":"916_CR33","doi-asserted-by":"publisher","DOI":"10.2139\/SSRN.4566617","author":"X Sun","year":"2023","unstructured":"Sun, X., He, Y., Huang, P.J.: Logo: a novel distributed computing framework for big data analytics. Journal (2023). https:\/\/doi.org\/10.2139\/SSRN.4566617","journal-title":"Journal"},{"key":"916_CR34","doi-asserted-by":"publisher","unstructured":"Zhanxiong, L., Xudong, S., Yonda, C., Yuming, Z., Langjie, M., Yulin, H., Zhexue, H.: A New Distributed Machine Learning Library Developed Based on LOGO Computing Framework. J. Integrat. Technol. (2024) https:\/\/doi.org\/10.12146\/J.ISSN.2095-3135.20240224001","DOI":"10.12146\/J.ISSN.2095-3135.20240224001"},{"key":"916_CR35","doi-asserted-by":"publisher","unstructured":"Shvachko, K., Kuang, H., Radia, S., Chansler, R.: The hadoop distributed file system. In: 2010 IEEE 26th Symposium on Mass Storage Systems and Technologies (MSST), pp. 1\u201310 (2010). https:\/\/doi.org\/10.1109\/MSST.2010.5496972","DOI":"10.1109\/MSST.2010.5496972"},{"key":"916_CR36","doi-asserted-by":"publisher","first-page":"105","DOI":"10.1016\/j.jss.2018.11.007","volume":"148","author":"TZ Emara","year":"2019","unstructured":"Emara, T.Z., Huang, J.Z.: A distributed data management system to support large-scale data analysis. J. Syst. Softw. 148, 105\u2013115 (2019). https:\/\/doi.org\/10.1016\/j.jss.2018.11.007","journal-title":"J. Syst. Softw."},{"key":"916_CR37","doi-asserted-by":"publisher","DOI":"10.1016\/j.scico.2019.102301","volume":"184","author":"TZ Emara","year":"2019","unstructured":"Emara, T.Z., Huang, J.Z.: Rrplib: a spark library for representing hdfs blocks as a set of random sample data blocks. Sci. Comput. Program. 184, 102301 (2019). https:\/\/doi.org\/10.1016\/j.scico.2019.102301","journal-title":"Sci. Comput. Program."},{"key":"916_CR38","doi-asserted-by":"publisher","first-page":"347","DOI":"10.1007\/978-3-319-94295-7_24","volume-title":"Cloud Comput.- CLOUD 2018","author":"C Wei","year":"2018","unstructured":"Wei, C., Salloum, S., Emara, T.Z., Zhang, X., Huang, J.Z., He, Y.: A two-stage data processing algorithm to generate random sample partitions for big data analysis. In: Luo, M., Zhang, L.-J. (eds.) Cloud Comput.- CLOUD 2018, pp. 347\u2013364. Springer, Cham (2018)"},{"issue":"4","key":"916_CR39","doi-asserted-by":"publisher","DOI":"10.1016\/J.IPM.2024.103746","volume":"61","author":"Y Cai","year":"2024","unstructured":"Cai, Y., Wu, D., Sun, X., Wu, S., Xu, J., Huang, J.Z.: CDFRS: a scalable sampling approach for efficient big data analysis. Inform. Process. Manage. 61(4), 103746 (2024). https:\/\/doi.org\/10.1016\/J.IPM.2024.103746","journal-title":"Inform. Process. Manage."},{"issue":"5","key":"916_CR40","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/S11704-023-2356-X\/METRICS","volume":"18","author":"Y He","year":"2024","unstructured":"He, Y., Chen, J., Shen, J., Fournier-Viger, P., Huang, J.Z.: Density estimation-based method to determine sample size for random sample partition of big data. Front. Comp. Sci. 18(5), 1\u201314 (2024). https:\/\/doi.org\/10.1007\/S11704-023-2356-X\/METRICS","journal-title":"Front. Comp. Sci."},{"key":"916_CR41","doi-asserted-by":"publisher","unstructured":"Yang, H., Pan, X., Deng, J., Yin, J.: An effective rsp data sampling algorithm. In: Knowledge Science, Engineering and Management: 17th International Conference, KSEM 2024, Birmingham, UK, August 16 18, 2024, Proceedings, Part IV, pp. 331\u2013342. Springer, Berlin, Heidelberg (2024). https:\/\/doi.org\/10.1007\/978-981-97-5501-1_25","DOI":"10.1007\/978-981-97-5501-1_25"},{"key":"916_CR42","first-page":"723","volume":"13","author":"A Gretton","year":"2012","unstructured":"Gretton, A., Borgwardt, K.M., Rasch, M.J., Sch\u00f6lkopf, B., Smola, A.: A kernel two-sample test. J. Mach. Learn. Res. 13, 723\u2013773 (2012)","journal-title":"J. Mach. Learn. Res."},{"issue":"1","key":"916_CR43","doi-asserted-by":"publisher","first-page":"182","DOI":"10.1109\/TAI.2022.3151724","volume":"4","author":"Y He","year":"2023","unstructured":"He, Y., Ye, X., Huang, D., Fournier-Viger, P., Huang, J.Z.: A hybrid method to measure distribution consistency of mixed-attribute datasets. IEEE Trans. Artif. Intell. 4(1), 182\u2013196 (2023). https:\/\/doi.org\/10.1109\/TAI.2022.3151724","journal-title":"IEEE Trans. Artif. Intell."},{"key":"916_CR44","doi-asserted-by":"publisher","first-page":"448","DOI":"10.1007\/978-3-030-60245-1_31","volume-title":"Algorithms Architect. Parall. Process.","author":"C Wei","year":"2020","unstructured":"Wei, C., Zhang, J., Valiullin, T., Cao, W., Wang, Q., Long, H.: Distributed and parallel ensemble classification for big data based on Kullback-Leibler random sample partition. In: Qiu, M. (ed.) Algorithms Architect. Parall. Process., pp. 448\u2013464. Springer, Cham (2020)"},{"issue":"4","key":"916_CR45","doi-asserted-by":"publisher","first-page":"1142","DOI":"10.1109\/TBDATA.2023.3255003","volume":"9","author":"MS Mahmud","year":"2023","unstructured":"Mahmud, M.S., Huang, J.Z., Ruby, R., Ngueilbaye, A., Wu, K.: Approximate clustering ensemble method for big data. IEEE Trans. Big Data 9(4), 1142\u20131155 (2023). https:\/\/doi.org\/10.1109\/TBDATA.2023.3255003","journal-title":"IEEE Trans. Big Data"},{"key":"916_CR46","doi-asserted-by":"publisher","first-page":"641","DOI":"10.2298\/CSIS200124015V","volume":"18","author":"T Valiullin","year":"2021","unstructured":"Valiullin, T., Huang, J.Z., Wei, C., Yin, J., Wu, D., Egorova, I.: A new approximate method for mining frequent itemsets from big data. Comput. Sci. Inf. Syst. 18, 641\u2013656 (2021)","journal-title":"Comput. Sci. Inf. Syst."},{"key":"916_CR47","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2023.3324911","author":"L Xie","year":"2023","unstructured":"Xie, L., Wang, T., Du, S., Cai, H.: CERT-DF: a computing-efficient and robust distributed deep forest framework with low communication overhead. IEEE Trans. Parallel Distrib. Syst. (2023). https:\/\/doi.org\/10.1109\/TPDS.2023.3324911","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"916_CR48","doi-asserted-by":"publisher","unstructured":"Salloum, S., Huang, J.Z.: Rsp-hist: Approximate histograms for big data exploration on hadoop clusters. In: 2021 IEEE 28th International Conference on High Performance Computing, Data, and Analytics (HiPC), pp. 412\u2013417 (2021). https:\/\/doi.org\/10.1109\/HiPC53243.2021.00058","DOI":"10.1109\/HiPC53243.2021.00058"},{"key":"916_CR49","first-page":"329","volume-title":"Adv. Intell. Comput. Technol. Appl.","author":"H Huang","year":"2025","unstructured":"Huang, H., Lu, X.: A novel random sample partition-based ensemble algorithm for credit card fraud detection. In: Huang, D.-S., Pan, Y., Chen, W., Li, B. (eds.) Adv. Intell. Comput. Technol. Appl., pp. 329\u2013340. Springer, Singapore (2025)"},{"key":"916_CR50","doi-asserted-by":"publisher","unstructured":"Singh, T., Khanna, R., Satakshi, Kumar, M.: Multiclass imbalanced big data classification utilizing spark cluster. In: 2021 12th International Conference on Computing Communication and Networking Technologies (ICCCNT), pp. 1\u20137 (2021). https:\/\/doi.org\/10.1109\/ICCCNT51525.2021.9580029","DOI":"10.1109\/ICCCNT51525.2021.9580029"},{"key":"916_CR51","doi-asserted-by":"publisher","first-page":"26663","DOI":"10.1007\/s11042-023-16624-y","volume":"83","author":"T Trinh","year":"2024","unstructured":"Trinh, T., HoangAnh, L., Nhung, V., Hai, H., KieuAnh, V.: A novel ensemble-based paradigm to process large-scale data. Multimed. Tools Appl. 83, 26663\u201326685 (2024). https:\/\/doi.org\/10.1007\/s11042-023-16624-y","journal-title":"Multimed. Tools Appl."},{"key":"916_CR52","doi-asserted-by":"publisher","unstructured":"Cai, W., Li, M.J.: An interval rsp-based ensemble model for big data analysis. In: International Distributed Multimedia Systems Conference on Visualization and Visual Languages (2023). https:\/\/doi.org\/10.18293\/DMSVIVA2023-089 . https:\/\/api.semanticscholar.org\/CorpusID:261559281","DOI":"10.18293\/DMSVIVA2023-089"},{"issue":"2","key":"916_CR53","doi-asserted-by":"publisher","first-page":"248","DOI":"10.1109\/TFUZZ.2014.2310734","volume":"23","author":"C Wagner","year":"2015","unstructured":"Wagner, C., Miller, S., Garibaldi, J.M., Anderson, D.T., Havens, T.C.: From interval-valued data to general type-2 fuzzy sets. IEEE Trans. Fuzzy Syst. 23(2), 248\u2013269 (2015). https:\/\/doi.org\/10.1109\/TFUZZ.2014.2310734","journal-title":"IEEE Trans. Fuzzy Syst."},{"key":"916_CR54","doi-asserted-by":"publisher","first-page":"177","DOI":"10.1016\/J.INS.2020.09.068","volume":"548","author":"Y He","year":"2021","unstructured":"He, Y., Wu, Y., Qin, H., Huang, J.Z., Jin, Y.: Improved I-nice clustering algorithm based on density peaks mechanism. Inform. Sci. 548, 177\u2013190 (2021). https:\/\/doi.org\/10.1016\/J.INS.2020.09.068","journal-title":"Inform. Sci."},{"issue":"6191","key":"916_CR55","doi-asserted-by":"publisher","first-page":"1492","DOI":"10.1126\/science.1242072","volume":"344","author":"A Rodriguez","year":"2014","unstructured":"Rodriguez, A., Laio, A.: Clustering by fast search and find of density peaks. Science 344(6191), 1492\u20131496 (2014). https:\/\/doi.org\/10.1126\/science.1242072","journal-title":"Science"},{"key":"916_CR56","doi-asserted-by":"publisher","first-page":"13","DOI":"10.1016\/j.ins.2018.01.013","volume":"436\u2013437","author":"Y-A Geng","year":"2018","unstructured":"Geng, Y.-A., Li, Q., Zheng, R., Zhuang, F., He, R., Xiong, N.: Recome: a new density-based clustering algorithm using relative knn kernel density. Inform. Sci. 436\u2013437, 13\u201330 (2018). https:\/\/doi.org\/10.1016\/j.ins.2018.01.013","journal-title":"Inform. Sci."},{"issue":"8","key":"916_CR57","doi-asserted-by":"publisher","first-page":"1799","DOI":"10.1109\/TPDS.2020.2975550","volume":"31","author":"Y-A Geng","year":"2020","unstructured":"Geng, Y.-A., Li, Q., Liang, M., Chi, C.-Y., Tan, J., Huang, H.: Local-density subspace distributed clustering for high-dimensional data. IEEE Trans. Parallel Distrib. Syst. 31(8), 1799\u20131814 (2020). https:\/\/doi.org\/10.1109\/TPDS.2020.2975550","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"issue":"8","key":"916_CR58","doi-asserted-by":"publisher","first-page":"3714","DOI":"10.1109\/TKDE.2020.3034611","volume":"34","author":"J Lu","year":"2022","unstructured":"Lu, J., Zhao, Y., Tan, K.-L., Wang, Z.: Distributed density peaks clustering revisited. IEEE Trans. Knowl. Data Eng. 34(8), 3714\u20133726 (2022). https:\/\/doi.org\/10.1109\/TKDE.2020.3034611","journal-title":"IEEE Trans. Knowl. Data Eng."},{"key":"916_CR59","unstructured":"Zaharia, M., Chowdhury, M., Franklin, M.J., Shenker, S., Stoica, I.: Spark: cluster computing with working sets, 10 (2010)"},{"issue":"11","key":"916_CR60","doi-asserted-by":"publisher","first-page":"56","DOI":"10.1145\/2934664","volume":"59","author":"M Zaharia","year":"2016","unstructured":"Zaharia, M., Xin, R.S., Wendell, P., Das, T., Armbrust, M., Dave, A., Meng, X., Rosen, J., Venkataraman, S., Franklin, M.J., Ghodsi, A., Gonzalez, J., Shenker, S., Stoica, I.: Apache spark: a unified engine for big data processing. Commun. ACM 59(11), 56\u201365 (2016). https:\/\/doi.org\/10.1145\/2934664","journal-title":"Commun. ACM"},{"issue":"3","key":"916_CR61","doi-asserted-by":"publisher","first-page":"145","DOI":"10.1007\/s41060-016-0027-9","volume":"1","author":"S Salloum","year":"2016","unstructured":"Salloum, S., Dautov, R., Chen, X., Peng, P.X., Huang, J.Z.: Big data analytics on apache spark. Int. J. Data Sci. Anal. 1(3), 145\u2013164 (2016). https:\/\/doi.org\/10.1007\/s41060-016-0027-9","journal-title":"Int. J. Data Sci. Anal."},{"key":"916_CR62","doi-asserted-by":"publisher","unstructured":"Zhou, Z.H., Feng, J.: Deep forest: Towards an alternative to deep neural networks. IJCAI Int. Joint Conf. Artif. Intell. 0, 3553\u20133559 (2017) https:\/\/doi.org\/10.24963\/IJCAI.2017\/497","DOI":"10.24963\/IJCAI.2017\/497"},{"issue":"11","key":"916_CR63","doi-asserted-by":"publisher","first-page":"3141","DOI":"10.1109\/TPDS.2021.3133544","volume":"33","author":"Z Chen","year":"2022","unstructured":"Chen, Z., Wang, T., Cai, H., Mondal, S.K., Sahoo, J.P.: Blb-gcforest: a high-performance distributed deep forest with adaptive sub-forest splitting. IEEE Trans. Parallel Distrib. Syst. 33(11), 3141\u20133152 (2022). https:\/\/doi.org\/10.1109\/TPDS.2021.3133544","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"916_CR64","doi-asserted-by":"publisher","first-page":"113","DOI":"10.1016\/J.JPDC.2019.05.001","volume":"132","author":"G Zhu","year":"2019","unstructured":"Zhu, G., Hu, Q., Gu, R., Yuan, C., Huang, Y.: ForestLayer: efficient training of deep forests on distributed task-parallel platforms. J. Parall. Distribut. Comput. 132, 113\u2013126 (2019). https:\/\/doi.org\/10.1016\/J.JPDC.2019.05.001","journal-title":"J. Parall. Distribut. Comput."},{"key":"916_CR65","unstructured":"Gibbons, P.B., Matias, Y., Poosala, V.: Fast incremental maintenance of approximate histograms, pp. 466\u2013475 (1997)"},{"key":"916_CR66","doi-asserted-by":"publisher","unstructured":"Shi, Y., Meng, X., Wang, F., Gan, Y.: Hedc: A histogram estimator for data in the cloud. In: Proceedings of the Fourth International Workshop on Cloud Data Management. CloudDB \u201912, pp. 51\u201358. ACM, New York, NY, USA (2012). https:\/\/doi.org\/10.1145\/2390021.2390032","DOI":"10.1145\/2390021.2390032"},{"key":"916_CR67","doi-asserted-by":"crossref","unstructured":"Salloum, S., Wu, Y., Huang, J.Z.: A sampling-based system for approximate big data analysis on computing clusters. In: Proceedings of the 28th ACM International Conference on Information and Knowledge Management, pp. 2481\u20132484 (2019)","DOI":"10.1145\/3357384.3358124"},{"key":"916_CR68","doi-asserted-by":"publisher","unstructured":"Rocklin: Dask: Parallel Computation with Blocked algorithms and Task Scheduling. In: Huff, Bergstra (eds.) Proceedings of the 14th Python in Science Conference, pp. 126\u2013132 (2015). https:\/\/doi.org\/10.25080\/Majora-7b98e3ed-013","DOI":"10.25080\/Majora-7b98e3ed-013"},{"key":"916_CR69","doi-asserted-by":"publisher","unstructured":"Armbrust, M., Ghodsi, A., Zaharia, M., Xin, R.S., Lian, C., Huai, Y., Liu, D., Bradley, J.K., Meng, X., Kaftan, T., Franklin, M.J.: Spark SQL. In: Proceedings of the 2015 ACM SIGMOD International Conference on Management of Data - SIGMOD \u201915, pp. 1383\u20131394. ACM Press, New York, New York, USA (2015). https:\/\/doi.org\/10.1145\/2723372.2742797 . http:\/\/dl.acm.org\/citation.cfm?id=2723372.2742797","DOI":"10.1145\/2723372.2742797"},{"key":"916_CR70","doi-asserted-by":"publisher","first-page":"178526","DOI":"10.1109\/ACCESS.2020.3027675","volume":"8","author":"TZ Emara","year":"2020","unstructured":"Emara, T.Z., Huang, J.Z.: Distributed data strategies to support large-scale data analysis across geo-distributed data centers. IEEE Access 8, 178526\u2013178538 (2020). https:\/\/doi.org\/10.1109\/ACCESS.2020.3027675","journal-title":"IEEE Access"},{"issue":"1","key":"916_CR71","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1038\/s41598-023-44789-x","volume":"13","author":"TZ Emara","year":"2023","unstructured":"Emara, T.Z., Trinh, T., Huang, J.Z.: Geographically distributed data management to support large-scale data analysis. Sci. Rep. 13(1), 1\u201310 (2023). https:\/\/doi.org\/10.1038\/s41598-023-44789-x","journal-title":"Sci. Rep."},{"key":"916_CR72","doi-asserted-by":"publisher","unstructured":"Dong, R., Cao, X., Chao, K., Wang, C., Xu, L., Xing, Z., He, M., Cheng, X.: Survey of computing power network evolution. In: cProceedings of the 2024 4th International Conference on Signal Processing and Communication Technology. SPCT \u201924, pp. 248\u2013253. Association for Computing Machinery, New York, NY, USA (2025). https:\/\/doi.org\/10.1145\/3712464.3712509","DOI":"10.1145\/3712464.3712509"},{"issue":"9","key":"916_CR73","doi-asserted-by":"publisher","first-page":"109","DOI":"10.23919\/JCC.ja.2021-0776","volume":"21","author":"S Yukun","year":"2024","unstructured":"Yukun, S., Bo, L., Junlin, L., Haonan, H., Xing, Z., Jing, P., Wenbo, W.: Computing power network: a survey. China Commun. 21(9), 109\u2013145 (2024). https:\/\/doi.org\/10.23919\/JCC.ja.2021-0776","journal-title":"China Commun."},{"key":"916_CR74","doi-asserted-by":"publisher","unstructured":"Zhang, N., Duan, H., Guan, Y., Mao, R., Song, G., Yang, J., Shan, Y.: The eastern data and western computing initiative in China contributes to its net-zero target. Engineering (2024). https:\/\/doi.org\/10.1016\/j.eng.2024.08.010","DOI":"10.1016\/j.eng.2024.08.010"},{"key":"916_CR75","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-025-16624-y","author":"T Ting","year":"2025","unstructured":"Ting, T., Li, M.: Enhanced secure storage and data privacy management system for big data based on multilayer model. Sci. Rep. (2025). https:\/\/doi.org\/10.1038\/s41598-025-16624-y","journal-title":"Sci. Rep."},{"key":"916_CR76","unstructured":"Mustafa, R., Mahmud, M.S., Shadid, M.: Random sampling method of large-scale graph data classification. Jurnal Kejuruteraan (2024)"},{"key":"916_CR77","doi-asserted-by":"crossref","unstructured":"Baumgartner, J., Zannettou, S., Keegan, B., Squire, M., Blackburn, J.: The pushshift reddit dataset. CoRR abs\/2001.08435 (2020) arXiv:2001.08435","DOI":"10.1609\/icwsm.v14i1.7347"},{"key":"916_CR78","doi-asserted-by":"publisher","unstructured":"Cleveland, W.S., Hafen, R.: Divide and recombine (d &r): Data science for large complex data. Stat. Anal. Data Min. ASA Data Sci. J. 7(6), 425\u2013433 https:\/\/doi.org\/10.1002\/sam.11242","DOI":"10.1002\/sam.11242"},{"issue":"1","key":"916_CR79","doi-asserted-by":"publisher","first-page":"57","DOI":"10.1080\/09332480.2018.1438711","volume":"31","author":"N Lazar","year":"2018","unstructured":"Lazar, N.: The big picture: divide and combine to conquer big data. Chance 31(1), 57\u201359 (2018). https:\/\/doi.org\/10.1080\/09332480.2018.1438711","journal-title":"Chance"},{"key":"916_CR80","doi-asserted-by":"publisher","first-page":"46","DOI":"10.1016\/j.neucom.2019.01.037","volume":"337","author":"Q Meng","year":"2019","unstructured":"Meng, Q., Chen, W., Wang, Y., Ma, Z.-M., Liu, T.-Y.: Convergence analysis of distributed stochastic gradient descent with shuffling. Neurocomputing 337, 46\u201357 (2019). https:\/\/doi.org\/10.1016\/j.neucom.2019.01.037","journal-title":"Neurocomputing"},{"issue":"5","key":"916_CR81","doi-asserted-by":"publisher","first-page":"1231","DOI":"10.1007\/s00778-024-00845-0","volume":"33","author":"L Xu","year":"2024","unstructured":"Xu, L., Qiu, S., Yuan, B., Jiang, J., Renggli, C., Gan, S., Kara, K., Li, G., Liu, J., Wu, W., Ye, J., Zhang, C.: Stochastic gradient descent without full data shuffle: with applications to in-database machine learning and deep learning systems. VLDB J. 33(5), 1231\u20131255 (2024). https:\/\/doi.org\/10.1007\/s00778-024-00845-0","journal-title":"VLDB J."},{"issue":"12","key":"916_CR82","doi-asserted-by":"publisher","first-page":"3649","DOI":"10.14778\/3611540.3611554","volume":"16","author":"C Br\u00fccke","year":"2023","unstructured":"Br\u00fccke, C., H\u00e4rtling, P., Palacios, R.D.E., Patel, H., Rabl, T.: Tpcx-ai - an industry standard benchmark for artificial intelligence and machine learning systems. Proc. VLDB Endow. 16(12), 3649\u20133661 (2023). https:\/\/doi.org\/10.14778\/3611540.3611554","journal-title":"Proc. VLDB Endow."}],"container-title":["International Journal of Data Science and Analytics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s41060-025-00916-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s41060-025-00916-7","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s41060-025-00916-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T09:36:49Z","timestamp":1773481009000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s41060-025-00916-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,2]]},"references-count":82,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026,6]]}},"alternative-id":["916"],"URL":"https:\/\/doi.org\/10.1007\/s41060-025-00916-7","relation":{},"ISSN":["2364-415X","2364-4168"],"issn-type":[{"value":"2364-415X","type":"print"},{"value":"2364-4168","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,12,2]]},"assertion":[{"value":"2 June 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 October 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 December 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"One of the co-authors, Prof. Joshua Zhexue Huang, is on the editorial board of the International Journal of Data Science and Analytics.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"35"}}