{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,1,13]],"date-time":"2025-01-13T07:10:06Z","timestamp":1736752206355,"version":"3.32.0"},"publisher-location":"Berlin, Heidelberg","reference-count":42,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"type":"print","value":"9783540654148"},{"type":"electronic","value":"9783540492610"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[1998]]},"DOI":"10.1007\/bfb0095364","type":"book-chapter","created":{"date-parts":[[2006,11,24]],"date-time":"2006-11-24T14:05:49Z","timestamp":1164377149000},"page":"418-432","source":"Crossref","is-referenced-by-count":0,"title":["Deploying fault-tolerance and task migration with NetSolve"],"prefix":"10.1007","author":[{"given":"James S.","family":"Plank","sequence":"first","affiliation":[]},{"given":"Henri","family":"Casanova","sequence":"additional","affiliation":[]},{"given":"Micah","family":"Beck","sequence":"additional","affiliation":[]},{"given":"Jack","family":"Dongarra","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2006,10,20]]},"reference":[{"issue":"2","key":"54_CR1","doi-asserted-by":"crossref","first-page":"18","DOI":"10.1109\/2.485843","volume":"29","author":"C. Amza","year":"1996","unstructured":"C. Amza, A. L. Cox, S. Dwarkadas, P. Keleher, H. Lu, R. Rajamony, W. Yu, and W. Zwaenepoel. TreadMarks: Shared Memory Computing on Networks of Workstations, IEEE Computer, 29(2): 18\u201328, February, 1996.","journal-title":"IEEE Computer"},{"key":"54_CR2","volume-title":"LAPACK Users\u2019 Guide","author":"E. Anderson","year":"1995","unstructured":"E. Anderson, Z. Bai, C. Bischof, J. Demmel, J. Dongarra, J. Du Croz, A. Greenbaum, S. Hammarling, A. McKenney, S. Ostrouchov, and D. 
Sorensen, LAPACK Users\u2019 Guide, Second Edition, SIAM, Philadelphia, PA, 1995.","edition":"Second Edition"},{"issue":"3","key":"54_CR3","doi-asserted-by":"publisher","first-page":"287","DOI":"10.1109\/71.372777","volume":"6","author":"D. E. Bakken","year":"1995","unstructured":"D. E. Bakken and R. D. Schlichting. Supporting Fault-Tolerant Parallel Programming in Linda. IEEE Transactions on Parallel and Distributed Systems, 6(3):287\u2013302, March 1995.","journal-title":"IEEE Transactions on Parallel and Distributed Systems"},{"key":"54_CR4","doi-asserted-by":"crossref","unstructured":"A. Baratloo, P. Dasgupta, and Z. M. Kedem. Calypso: A Novel Software System for Fault-Tolerant Parallel Processing on Distributed Platforms. In 4th IEEE International Symposium on High Performance Distributed Computing, August 1995.","DOI":"10.1109\/HPDC.1995.518702"},{"key":"54_CR5","doi-asserted-by":"crossref","unstructured":"A. Beguelin, E. Seligman, and P. Stephan. Application Level Fault Tolerance in Heterogeneous Networks of Workstations. Journal of Parallel and Distributed Computing, September 1997.","DOI":"10.1006\/jpdc.1997.1338"},{"key":"54_CR6","doi-asserted-by":"crossref","DOI":"10.1137\/1.9780898719642","volume-title":"ScaLAPACK Users\u2019 Guide","author":"L. S. Blackford","year":"1997","unstructured":"L. S. Blackford, J. Choi, A. Cleary, E. D\u2019Azevedo, J. Demmel, I. Dhillon, J. Dongarra, S. Hammarling, G. Henry, A. Petitet, K. Stanley, D. Walker, and R. C. Whaley. ScaLAPACK Users\u2019 Guide. Society for Industrial and Applied Mathematics, Philadelphia, PA, 1997."},{"key":"54_CR7","doi-asserted-by":"crossref","unstructured":"D. Boley, G. H. Golub, S. Makar, N. Saxena, and E. J. McCluskey. Floating Point Fault Tolerance with Backward Error Assertions. IEEE Transactions on Computers, 44(2), February 1995.","DOI":"10.1109\/12.364541"},{"key":"54_CR8","doi-asserted-by":"crossref","unstructured":"G. Cabillic, G. Muller, and I. Puaut. 
The Performance of Consistent Checkpointing in Distributed Shared Memory Systems. In Proceedings of the 1995 European Intel Supercomputer Users\u2019 Group Meeting, 1995.","DOI":"10.1109\/RELDIS.1995.526217"},{"key":"54_CR9","unstructured":"H. Casanova and J. Dongarra. NetSolve\u2019s Network Enabled Server: Examples and Applications. IEEE Computational Science & Engineering, to appear."},{"key":"54_CR10","unstructured":"J. Casas, D. L. Clark, P. S. Galbiati, R. Konuru, S. W. Otto, R. M. Prouty, and J. Walpole. MIST: PVM with transparent migration and checkpointing. In 3rd Annual PVM Users\u2019 Group Meeting, Pittsburgh, PA, May 1995."},{"key":"54_CR11","unstructured":"M. Castro, P. Guedes, M. Sequeira, and M. Costa. A checkpoint protocol for an entry consistent shared memory system. In Thirteenth ACM Symposium on Principles of Distributed Computing, Los Angeles, CA, August 1994."},{"key":"54_CR12","doi-asserted-by":"crossref","unstructured":"Y. Chen, J. S. Plank, and K. Li. CLIP: A Checkpointing Tool for Message-Passing Parallel Programs. In SC97: High Performance Networking and Computing, San Jose, November 1997.","DOI":"10.1145\/509593.509626"},{"key":"54_CR13","doi-asserted-by":"crossref","unstructured":"P. E. Chung, Y. Huang, S. Yajnik, G. Fowler, K. P. Vo, and Y. M. Wang. Checkpointing in CosMiC: a user-level process migration environment. In Pacific Rim International Symposium on Fault-Tolerant Systems, December 1997.","DOI":"10.1109\/PRFTS.1997.640146"},{"key":"54_CR14","doi-asserted-by":"crossref","unstructured":"D. Cummings and L. Alkalaj. Checkpoint\/Rollback in a Distributed System Using Coarse-Grained Dataflow. In 24th International Symposium on Fault-Tolerant Computing, pages 424\u2013433, Austin, TX, June 1994.","DOI":"10.1109\/FTCS.1994.315619"},{"key":"54_CR15","unstructured":"J. Czyzyk, M. Mesnier, and J. Mor\u00e9. NEOS: The Network-Enabled Optimization System. 
Technical Report MCS-P615-1096, Mathematics and Computer Science Division, Argonne National Laboratory, 1996."},{"key":"54_CR16","doi-asserted-by":"crossref","unstructured":"M. J. Feeley, W. E. Morgan, F. H. Pighin, A. R. Karlin, and H. M. Levy. Implementing Global Memory Management in a Workstation Cluster. In 15th Symposium on Operating Systems Principles, pages 201\u2013212. ACM, December 1995.","DOI":"10.1145\/224056.224072"},{"key":"54_CR17","unstructured":"I. Foster, C. Kesselman, C. Lee, G. von Laszewski, and P. Stelling. A Fault Detection Service for Wide Area Distributed Computations. In Proc. of the High Performance Distributed Computing Conference, to appear."},{"key":"54_CR18","doi-asserted-by":"crossref","unstructured":"I. Foster and C. Kesselman. Globus: A Metacomputing Infrastructure Toolkit. In Proc. Workshop on Environments and Tools. SIAM, to appear.","DOI":"10.1177\/109434209701100205"},{"key":"54_CR19","unstructured":"A. Grimshaw, W. Wulf, J. French, A. Weaver, and P. Jr. Reynolds. A Synopsis of the Legion Project. Technical Report CS-94-20, Department of Computer Science, University of Virginia, 1994."},{"issue":"6","key":"54_CR20","doi-asserted-by":"crossref","first-page":"518","DOI":"10.1109\/TC.1984.1676475","volume":"33","author":"K-H. Huang","year":"1984","unstructured":"K-H. Huang and J. A. Abraham. Algorithm-Based Fault Tolerance for Matrix Operations. IEEE Transactions on Computers, C-33(6):518\u2013528, June 1984.","journal-title":"IEEE Transactions on Computers"},{"key":"54_CR21","unstructured":"The Math Works Inc. MATLAB Reference Guide. 1992."},{"key":"54_CR22","doi-asserted-by":"crossref","unstructured":"G. Janakiraman and Y. Tamir. Coordinated Checkpointing-Rollback Error Recovery for Distributed Shared Memory Multicomputers. In 13th Symposium on Reliable Distributed Systems, pages 42\u201351, October 1994.","DOI":"10.1109\/RELDIS.1994.336910"},{"key":"54_CR23","doi-asserted-by":"crossref","unstructured":"K. L. Johnson, M. F. 
Kaashoek, and D. A. Wallach. CRL: High-Performance All-Software Distributed Shared Memory. In 15th Symposium on Operating Systems Principles, pages 213\u2013228. ACM, December 1995.","DOI":"10.1145\/224056.224073"},{"key":"54_CR24","unstructured":"Y. Kim, J. S. Plank, and J. Dongarra. Fault Tolerant Matrix Operations using Checksum and Reverse Computation. In 6th Symposium on the Frontiers of Massively Parallel Computation, October 1996."},{"key":"54_CR25","volume-title":"Proc. of IEEE Workshop on Experimental Distributed Systems","author":"M. Litzkow","year":"1990","unstructured":"M. Litzkow and M. Livny. Experience with the Condor Distributed Batch System. In Proc. of IEEE Workshop on Experimental Distributed Systems. Department of Computer Science, University of Wisconsin, Madison, 1990."},{"key":"54_CR26","doi-asserted-by":"crossref","unstructured":"M. W. Mutka and M. Livny. The available capacity of a privately owned workstation environment. Performance Evaluation, August 1991.","DOI":"10.1016\/0166-5316(91)90005-N"},{"key":"54_CR27","doi-asserted-by":"crossref","unstructured":"V. K. Naik, S. P. Midkiff, and J. E. Moreira. A Checkpointing Strategy for Scalable Recovery on Distributed Parallel Systems. In SC97: High Performance Networking and Computing, San Jose, November 1997.","DOI":"10.1145\/509593.509625"},{"issue":"5","key":"54_CR28","doi-asserted-by":"publisher","first-page":"5","DOI":"10.1145\/37499.37502","volume":"21","author":"D. A. Nichols","year":"1987","unstructured":"D. A. Nichols. Using Idle Workstations in a Shared Computing Environment. Operating Systems Review: Proceedings of SOSP-11, 21(5):5\u201312, November 1987.","journal-title":"Operating Systems Review: Proceedings of SOSP-11"},{"key":"54_CR29","unstructured":"R. Orfali and D. Harkey. Client\/Server Programming with Java and CORBA. John Wiley & Sons, Inc, 1997."},{"key":"54_CR30","doi-asserted-by":"publisher","first-page":"125","DOI":"10.1006\/jpdc.1997.1336","volume":"43","author":"J. S. 
Plank","year":"1997","unstructured":"J. S. Plank, Y. Kim, and J. Dongarra. Fault Tolerant Matrix Operations for Networks of Workstations Using Diskless Checkpointing. Journal of Parallel and Distributed Computing, 43:125\u2013138, September 1997.","journal-title":"Journal of Parallel and Distributed Computing"},{"key":"54_CR31","doi-asserted-by":"crossref","unstructured":"J. Pruyne and M. Livny. Parallel Processing on Dynamic Resources with CARMI. In First IPPS Workshop on Job Scheduling Strategies for Parallel Processing, April 1995.","DOI":"10.1007\/3-540-60153-8_33"},{"key":"54_CR32","doi-asserted-by":"crossref","unstructured":"B. Ramkumar and V. Strumpen. Portable Checkpointing and Recovery in Heterogeneous Environments. In 27th International Symposium on Fault-Tolerant Computing, 1997.","DOI":"10.1109\/FTCS.1997.614078"},{"key":"54_CR33","unstructured":"D. J. Scales and M. S. Lam. Transparent Fault Tolerance for Parallel Applications on Networks of Workstations. In Usenix 1996 Technical Conference on UNIX and Advanced Computing Systems, San Diego, January 1996."},{"key":"54_CR34","unstructured":"S. Sekiguchi, M. Sato, H. Nakada, S. Matsuoka, and U. Nagashima. Ninf: Network based Information Library for Globally High Performance Computing. In Proc. of Parallel Object-Oriented Methods and Applications (POOMA), Santa Fe, 1996."},{"key":"54_CR35","doi-asserted-by":"crossref","unstructured":"L. M. Silva, J. G. Silva, S. Chapple, and L. Clarke. Portable Checkpointing and Recovery. In Proceedings of the HPDC-4, High-Performance Distributed Computing, pages 188\u2013195, Washington, DC, August 1995.","DOI":"10.1109\/HPDC.1995.518709"},{"key":"54_CR36","doi-asserted-by":"crossref","unstructured":"L. M. Silva, B. Veer, and J. G. Silva. Checkpointing SPMD Applications on Transputer Networks. 
In Scalable High Performance Computing Conference, pages 694\u2013701, Knoxville, TN, May 1994.","DOI":"10.1109\/SHPCC.1994.296709"},{"key":"54_CR37","doi-asserted-by":"crossref","unstructured":"B. Steensgaard and E. Jul. Object and native code thread mobility among heterogeneous computers. In 15th Symposium on Operating Systems Principles, pages 68\u201378. ACM, December 1995.","DOI":"10.1145\/224057.224063"},{"key":"54_CR38","doi-asserted-by":"crossref","unstructured":"G. Stellner. CoCheck: Checkpointing and Process Migration for MPI. In 10th International Parallel Processing Symposium, April 1996.","DOI":"10.1109\/IPPS.1996.508106"},{"key":"54_CR39","doi-asserted-by":"crossref","unstructured":"G. Suri, B. Janssens, and W. K. Fuchs. Reduced Overhead Logging for Rollback Recovery in Distributed Shared Memory. In 24th International Symposium on Fault-Tolerant Computing, pages 279\u2013288, June 1994.","DOI":"10.1109\/FTCS.1995.466971"},{"issue":"8","key":"54_CR40","doi-asserted-by":"publisher","first-page":"942","DOI":"10.1109\/12.609281","volume":"46","author":"N. H. Vaidya","year":"1997","unstructured":"N. H. Vaidya. Impact of Checkpoint Latency on Overhead Ratio of a Checkpointing Scheme. IEEE Transactions on Computers, 46(8):942\u2013947, August 1997.","journal-title":"IEEE Transactions on Computers"},{"key":"54_CR41","unstructured":"S. Wolfram. The Mathematica Book, Third Edition. Wolfram Media, Inc. and Cambridge University Press, 1996."},{"key":"54_CR42","unstructured":"R. Wolski. Dynamically forecasting network performance to support dynamic scheduling using the Network Weather Service. 
In 6th High-Performance Distributed Computing Conference, August 1997."}],"container-title":["Lecture Notes in Computer Science","Applied Parallel Computing Large Scale Scientific and Industrial Problems"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/BFb0095364","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,12]],"date-time":"2025-01-12T04:19:25Z","timestamp":1736655565000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/BFb0095364"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[1998]]},"ISBN":["9783540654148","9783540492610"],"references-count":42,"URL":"https:\/\/doi.org\/10.1007\/bfb0095364","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[1998]]}}}