{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,30]],"date-time":"2025-04-30T04:19:42Z","timestamp":1745986782686,"version":"3.40.4"},"publisher-location":"Berlin, Heidelberg","reference-count":34,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"type":"print","value":"9783642358661"},{"type":"electronic","value":"9783642358678"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2013]]},"DOI":"10.1007\/978-3-642-35867-8_3","type":"book-chapter","created":{"date-parts":[[2013,1,11]],"date-time":"2013-01-11T02:24:52Z","timestamp":1357871092000},"page":"36-55","source":"Crossref","is-referenced-by-count":8,"title":["Employing Checkpoint to Improve Job Scheduling in Large-Scale Systems"],"prefix":"10.1007","author":[{"given":"Shuangcheng","family":"Niu","sequence":"first","affiliation":[]},{"given":"Jidong","family":"Zhai","sequence":"additional","affiliation":[]},{"given":"Xiaosong","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Mingliang","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Yan","family":"Zhai","sequence":"additional","affiliation":[]},{"given":"Wenguang","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Weimin","family":"Zheng","sequence":"additional","affiliation":[]}],"member":"297","reference":[{"issue":"3","key":"3_CR1","doi-asserted-by":"publisher","first-page":"236","DOI":"10.1109\/TPDS.2003.1189582","volume":"14","author":"Y. Zhang","year":"2003","unstructured":"Zhang, Y., Franke, H., Moreira, J., Sivasubramaniam, A.: An integrated approach to parallel scheduling using gang-scheduling, backfilling, and migration. IEEE Transactions on Parallel and Distributed Systems\u00a014(3), 236\u2013247 (2003)","journal-title":"IEEE Transactions on Parallel and Distributed Systems"},{"issue":"2","key":"3_CR2","doi-asserted-by":"publisher","first-page":"146","DOI":"10.1145\/151244.151246","volume":"11","author":"C. McCann","year":"1993","unstructured":"McCann, C., Vaswani, R., Zahorjan, J.: A dynamic processor allocation policy for iviukiprogrammed shared-memory multiprocessors. ACM Transactions on Computer Systems\u00a011(2), 146\u2013178 (1993)","journal-title":"ACM Transactions on Computer Systems"},{"key":"3_CR3","doi-asserted-by":"crossref","unstructured":"Majumdar, S., Eager, D.L., Bunt, R.B.: Scheduling in multiprogrammed parallel systems, vol.\u00a016. ACM (1988)","DOI":"10.1145\/1007771.55608"},{"key":"3_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"295","DOI":"10.1007\/3-540-60153-8_35","volume-title":"Job Scheduling Strategies for Parallel Processing","author":"D. Lifka","year":"1995","unstructured":"Lifka, D.: The ANL\/IBM SP Scheduling System. In: Feitelson, D.G., Rudolph, L. (eds.) IPPS-WS 1995 and JSSPP 1995. LNCS, vol.\u00a0949, pp. 295\u2013303. Springer, Heidelberg (1995)"},{"key":"3_CR5","unstructured":"Platform\u00a0Computing Inc. Platform LSF (2012), http:\/\/www.platform.com\/products\/LSFfamily\/"},{"key":"3_CR6","unstructured":"Adaptive Computing\u00a0Enterprises Inc. MOAB workload manager (2012), http:\/\/www.supercluster.org\/moab\/"},{"key":"3_CR7","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"87","DOI":"10.1007\/3-540-45540-X_6","volume-title":"Job Scheduling Strategies for Parallel Processing","author":"D. Jackson","year":"2001","unstructured":"Jackson, D., Snell, Q., Clement, M.: Core Algorithms of the Maui Scheduler. In: Feitelson, D.G., Rudolph, L. (eds.) JSSPP 2001. LNCS, vol.\u00a02221, pp. 87\u2013102. Springer, Heidelberg (2001)"},{"key":"3_CR8","unstructured":"Adaptive Computing\u00a0Enterprises Inc. PBS\/Torque user manual (2012), http:\/\/www.clusterresources.com\/torquedocs21\/usersmanual.shtml"},{"key":"3_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"41","DOI":"10.1007\/BFb0022286","volume-title":"Job Scheduling Strategies for Parallel Processing","author":"J. Skovira","year":"1996","unstructured":"Skovira, J., Chan, W., Zhou, H., Lifka, D.: The EASY \u2013 LoadLeveler API Project. In: Feitelson, D.G., Rudolph, L. (eds.) IPPS-WS 1996 and JSSPP 1996. LNCS, vol.\u00a01162, pp. 41\u201347. Springer, Heidelberg (1996)"},{"key":"3_CR10","unstructured":"Parallel Workloads Archive (2012), http:\/\/www.cs.huji.ac.il\/labs\/parallel\/workload\/"},{"key":"3_CR11","doi-asserted-by":"crossref","unstructured":"Srinivasan, S., Kettimuthu, R., Subramani, V., Sadayappan, P.: Characterization of backfilling strategies for parallel job scheduling. In: Proceedings of the International Conference on Parallel Processing Workshops, pp. 514\u2013519. IEEE (2002)","DOI":"10.1109\/ICPPW.2002.1039773"},{"key":"3_CR12","doi-asserted-by":"crossref","unstructured":"Cirne, W., Berman, F.: A comprehensive model of the supercomputer workload. In: 2001 IEEE International Workshop on Workload Characterization, WWC-4, pp. 140\u2013148. IEEE (2001)","DOI":"10.1109\/WWC.2001.990753"},{"key":"3_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"103","DOI":"10.1007\/3-540-36180-4_7","volume-title":"Job Scheduling Strategies for Parallel Processing","author":"S.-H. Chiang","year":"2002","unstructured":"Chiang, S.-H., Arpaci-Dusseau, A., Vernon, M.K.: The Impact of More Accurate Requested Runtimes on Production Job Scheduling Performance. In: Feitelson, D.G., Rudolph, L., Schwiegelshohn, U. (eds.) JSSPP 2002. LNCS, vol.\u00a02537, pp. 103\u2013127. Springer, Heidelberg (2002)"},{"key":"3_CR14","doi-asserted-by":"publisher","first-page":"305","DOI":"10.1145\/1837853.1693493","volume":"45","author":"J. Zhai","year":"2010","unstructured":"Zhai, J., Chen, W., Zheng, W.: PHANTOM: predicting performance of parallel applications on large-scale parallel machines using a single node. ACM SIGPLAN Notices\u00a045, 305\u2013314 (2010)","journal-title":"ACM SIGPLAN Notices"},{"key":"3_CR15","doi-asserted-by":"crossref","unstructured":"Tang, W., Desai, N., Buettner, D., Lan, Z.: Analyzing and adjusting user runtime estimates to improve job scheduling on the Blue Gene\/P. In: 2010 IEEE International Symposium on Parallel & Distributed Processing (IPDPS), pp. 1\u201311. IEEE (2010)","DOI":"10.1109\/IPDPS.2010.5470474"},{"key":"3_CR16","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"253","DOI":"10.1007\/11407522_14","volume-title":"Job Scheduling Strategies for Parallel Processing","author":"C. Bailey Lee","year":"2005","unstructured":"Bailey Lee, C., Schwartzman, Y., Hardy, J., Snavely, A.: Are User Runtime Estimates Inherently Inaccurate? In: Feitelson, D.G., Rudolph, L., Schwiegelshohn, U. (eds.) JSSPP 2004. LNCS, vol.\u00a03277, pp. 253\u2013263. Springer, Heidelberg (2005)"},{"key":"3_CR17","unstructured":"Berkeley Lab Checkpoint\/Restart, BLCR (2012), https:\/\/ftg.lbl.gov\/projects\/CheckpointRestart\/"},{"key":"3_CR18","doi-asserted-by":"crossref","unstructured":"Bent, J., Gibson, G., Grider, G., McClelland, B., Nowoczynski, P., Nunez, J., Polte, M., Wingate, M.: Plfs: A checkpoint filesystem for parallel applications. In: Proceedings of the Conference on High Performance Computing Networking, Storage and Analysis, p. 21. ACM (2009)","DOI":"10.1145\/1654059.1654081"},{"key":"3_CR19","doi-asserted-by":"crossref","unstructured":"Liu, Y., Nassar, R., Leangsuksun, C., Naksinehaboon, N., Paun, M., Scott, S.L.: An optimal checkpoint\/restart model for a large scale high performance computing system. In: IEEE International Symposium on Parallel and Distributed Processing, IPDPS 2008, pp. 1\u20139. IEEE (2008)","DOI":"10.1109\/IPDPS.2008.4536279"},{"key":"3_CR20","doi-asserted-by":"crossref","unstructured":"Bronevetsky, G., Marques, D., Pingali, K., Stodghill, P.: Automated application-level checkpointing of MPI programs. In: Proceedings of the Ninth ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, pp. 84\u201394. ACM (2003)","DOI":"10.1145\/781498.781513"},{"issue":"08","key":"3_CR21","first-page":"2690","volume":"2","author":"P.M. Mallikarjuna Shastry","year":"2010","unstructured":"Mallikarjuna Shastry, P.M., Venkatesh, K.: Analysis of Dependencies of Checkpoint Cost and Checkpoint Interval of Fault Tolerant MPI Applications. Analysis\u00a02(08), 2690\u20132697 (2010)","journal-title":"Analysis"},{"key":"3_CR22","unstructured":"TOP500 Supercomputing web site (2012), http:\/\/www.top500.org"},{"key":"3_CR23","doi-asserted-by":"crossref","unstructured":"Naik, H., Gupta, R., Beckman, P.: Analyzing checkpointing trends for applications on the IBM Blue Gene\/P system. In: International Conference on Parallel Processing Workshops, ICPPW 2009, pp. 81\u201388. IEEE (2009)","DOI":"10.1109\/ICPPW.2009.42"},{"key":"3_CR24","doi-asserted-by":"crossref","unstructured":"Feitelson, D.G., Weil, A.M.: Utilization and predictability in scheduling the ibm sp2 with backfilling. In: Proceedings of the First Merged International... and Symposium on Parallel and Distributed Processing, Parallel Processing Symposium, IPPS\/SPDP 1998, pp. 542\u2013546. IEEE (1998)","DOI":"10.1109\/IPPS.1998.669970"},{"key":"3_CR25","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"88","DOI":"10.1007\/3-540-36180-4_6","volume-title":"Job Scheduling Strategies for Parallel Processing","author":"W.A. Ward Jr.","year":"2002","unstructured":"Ward Jr., W.A., Mahood, C.L., West, J.E.: Scheduling Jobs on Parallel Systems Using a Relaxed Backfill Strategy. In: Feitelson, D.G., Rudolph, L., Schwiegelshohn, U. (eds.) JSSPP 2002. LNCS, vol.\u00a02537, pp. 88\u2013102. Springer, Heidelberg (2002)"},{"issue":"6","key":"3_CR26","doi-asserted-by":"publisher","first-page":"789","DOI":"10.1109\/TPDS.2007.70606","volume":"18","author":"D. Tsafrir","year":"2007","unstructured":"Tsafrir, D., Etsion, Y., Feitelson, D.G.: Backfilling using system-generated predictions rather than user runtime estimates. IEEE Transactions on Parallel and Distributed Systems\u00a018(6), 789\u2013803 (2007)","journal-title":"IEEE Transactions on Parallel and Distributed Systems"},{"key":"3_CR27","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"24","DOI":"10.1007\/3-540-36180-4_2","volume-title":"Job Scheduling Strategies for Parallel Processing","author":"Q.O. Snell","year":"2002","unstructured":"Snell, Q.O., Clement, M.J., Jackson, D.B.: Preemption Based Backfill. In: Feitelson, D.G., Rudolph, L., Schwiegelshohn, U. (eds.) JSSPP 2002. LNCS, vol.\u00a02537, pp. 24\u201337. Springer, Heidelberg (2002)"},{"key":"3_CR28","unstructured":"Adaptive Computing\u00a0Enterprises Inc. Preemption Policies (2012), http:\/\/www.adaptivecomputing.com\/resources\/docs\/maui\/8.4preemption.php"},{"key":"3_CR29","doi-asserted-by":"crossref","unstructured":"Perkovic, D., Keleher, P.J.: Randomization, speculation, and adaptation in batch schedulers. In: Proceedings of the 2000 ACM\/IEEE Conference on Supercomputing (CDROM), p. 7. IEEE Computer Society (2000)","DOI":"10.1109\/SC.2000.10041"},{"key":"3_CR30","doi-asserted-by":"crossref","unstructured":"Jette, M.A.: Performance characteristics of gang scheduling in multiprogrammed environments. In: ACM\/IEEE 1997 Conference on Supercomputing, pp. 54\u201354. IEEE (1997)","DOI":"10.1145\/509593.509647"},{"key":"3_CR31","unstructured":"Jette, M., Storch, D., Yim, E.: Gang scheduler-timesharing the cray t3d, pp. 247\u2013252. Cray User Group (1996)"},{"key":"3_CR32","unstructured":"Sosa, C., Knudson, B.: IBM System Blue Gene\/P Solution: Blue Gene\/P Application Development (2007), http:\/\/www.redbooks.ibm.com\/abstracts\/sg247287.html"},{"key":"3_CR33","doi-asserted-by":"crossref","unstructured":"Xue, R., Chen, W., Zheng, W.: CprFS: a user-level file system to support consistent file states for checkpoint and restart. In: Proceedings of the 22nd Annual International Conference on Supercomputing, pp. 114\u2013123. ACM (2008)","DOI":"10.1145\/1375527.1375547"},{"key":"3_CR34","doi-asserted-by":"crossref","unstructured":"Liu, Y., Nassar, R., Leangsuksun, C., Naksinehaboon, N., Paun, M., Scott, S.L.: An optimal checkpoint\/restart model for a large scale high performance computing system. In: IEEE International Symposium on Parallel and Distributed Processing, IPDPS 2008, pp. 1\u20139. IEEE (2008)","DOI":"10.1109\/IPDPS.2008.4536279"}],"container-title":["Lecture Notes in Computer Science","Job Scheduling Strategies for Parallel Processing"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-642-35867-8_3.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,29]],"date-time":"2025-04-29T16:50:56Z","timestamp":1745945456000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-642-35867-8_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2013]]},"ISBN":["9783642358661","9783642358678"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-642-35867-8_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2013]]}}}