{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T07:14:06Z","timestamp":1742973246627,"version":"3.40.3"},"publisher-location":"Cham","reference-count":23,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031661457"},{"type":"electronic","value":"9783031661464"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-66146-4_2","type":"book-chapter","created":{"date-parts":[[2024,8,1]],"date-time":"2024-08-01T17:02:49Z","timestamp":1722531769000},"page":"18-32","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["An Approach Towards Distributed DNN Training on\u00a0FPGA Clusters"],"prefix":"10.1007","author":[{"given":"Philipp","family":"Kreowsky","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Justin","family":"Knapheide","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Benno","family":"Stabernack","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,8,1]]},"reference":[{"issue":"4","key":"2_CR1","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3320060","volume":"52","author":"T Ben-Nun","year":"2019","unstructured":"Ben-Nun, T., Hoefler, T.: Demystifying parallel and distributed deep learning: an in-depth concurrency analysis. ACM Comput. Surv. (CSUR) 52(4), 1\u201343 (2019)","journal-title":"ACM Comput. Surv. (CSUR)"},{"key":"2_CR2","doi-asserted-by":"crossref","unstructured":"Narayanan, D., et al.: Pipedream: generalized pipeline parallelism for dnn training. In: Proceedings of the 27th ACM Symposium on Operating Systems Principles, SOSP \u201919, pp. 1\u201315. Association for Computing Machinery, New York (2019)","DOI":"10.1145\/3341301.3359646"},{"key":"2_CR3","doi-asserted-by":"crossref","unstructured":"Unnikrishnan, N.K., Parhi, K.K.: LayerPipe: accelerating deep neural network training by intra-layer and inter-layer gradient pipelining and multiprocessor scheduling. In: 2021 IEEE\/ACM International Conference on Computer Aided Design (ICCAD). IEEE (2021)","DOI":"10.1109\/ICCAD51958.2021.9643567"},{"key":"2_CR4","unstructured":"Huang, Y., et al.: Gpipe: efficient training of giant neural networks using pipeline parallelism. In: Proceedings of the 33rd International Conference on Neural Information Processing Systems (2019). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2019\/file\/093f65e080a295f8076b1c5722a46aa2-Paper.pdf"},{"key":"2_CR5","first-page":"1143","volume":"69","author":"T Wang","year":"2020","unstructured":"Wang, T., Geng, T., Li, A., Jin, X., Herbordt, M.: Fpdeep: scalable acceleration of CNN training on deeply-pipelined FPGA clusters. IEEE Trans. Comput. 69, 1143\u20131158 (2020)","journal-title":"IEEE Trans. Comput."},{"volume-title":"Heterogeneous Computing Architectures","year":"2019","key":"2_CR6","unstructured":"Terzo, O., Djemame, K., Scionti, A., Pezuela, C. (eds.): Heterogeneous Computing Architectures. CRC Press, Boca Raton (2019)"},{"key":"2_CR7","unstructured":"Lacey, G., Taylor, G.W.,\u00a0Areibi, S.: Deep learning on FPGAs: past, present, and future (2016)"},{"key":"2_CR8","doi-asserted-by":"crossref","unstructured":"Zhang, C.,\u00a0Wu, D.,\u00a0Sun, J.,\u00a0Sun, G.,\u00a0Luo, G.,\u00a0Cong, J.: Energy-efficient CNN implementation on a deeply pipelined FPGA cluster. In: Proceedings of the 2016 International Symposium on Low Power Electronics and Design. ACM (2016)","DOI":"10.1145\/2934583.2934644"},{"key":"2_CR9","doi-asserted-by":"publisher","first-page":"7823","DOI":"10.1109\/ACCESS.2018.2890150","volume":"7","author":"A Shawahna","year":"2019","unstructured":"Shawahna, A., Sait, S.M., El-Maleh, A.: FPGA-based accelerators of deep learning networks for learning and classification: a review. IEEE Access 7, 7823\u20137859 (2019)","journal-title":"IEEE Access"},{"key":"2_CR10","doi-asserted-by":"publisher","first-page":"103691","DOI":"10.1109\/ACCESS.2021.3098730","volume":"9","author":"S Nambi","year":"2021","unstructured":"Nambi, S., Ullah, S., Sahoo, S.S., Lohana, A., Merchant, F., Kumar, A.: Expan (n) d: Exploring posits for efficient artificial neural network design in fpga-based systems. IEEE Access 9, 103691\u2013103708 (2021)","journal-title":"IEEE Access"},{"key":"2_CR11","doi-asserted-by":"crossref","unstructured":"Zhu, Y., He, Z., Jiang, W., Zeng, K., Zhou, J., Alonso, G.: Distributed recommendation inference on FPGA clusters, pp. 279\u2013285. IEEE, Dresden (2021)","DOI":"10.1109\/FPL53798.2021.00057"},{"issue":"3","key":"2_CR12","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3519598","volume":"19","author":"Z Choudhury","year":"2022","unstructured":"Choudhury, Z., Shrivastava, S., Ramapantulu, L., Purini, S.: An FPGA overlay for CNN inference with fine-grained flexible parallelism. ACM Trans. Arch. Code Optim. 19(3), 1\u201326 (2022)","journal-title":"ACM Trans. Arch. Code Optim."},{"key":"2_CR13","doi-asserted-by":"crossref","unstructured":"Zhao, W., et al.: F-CNN: an FPGA-based framework for training convolutional neural networks. In: 2016 IEEE 27th International Conference on Application-specific Systems, Architectures and Processors (ASAP), pp. 107\u2013114. IEEE, London (2016)","DOI":"10.1109\/ASAP.2016.7760779"},{"key":"2_CR14","doi-asserted-by":"crossref","unstructured":"Luo, C., Sit, M.-K.,\u00a0Fan, H.,\u00a0Liu, S.,\u00a0Luk, W.,\u00a0Guo, C.: Towards efficient deep neural network training by fpga-based batch-level parallelism. In: 2019 IEEE 27th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM), pp. 45\u201352. IEEE, San Diego (2019)","DOI":"10.1109\/FCCM.2019.00016"},{"key":"2_CR15","doi-asserted-by":"crossref","unstructured":"Itsubo, T.,\u00a0Koibuchi, M.,\u00a0Amano, H.,\u00a0Matsutani, H.: Accelerating deep learning using multiple GPUS and FPGA-based 10 gbe switch. In: 2020 28th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP), pp. 102\u2013109. IEEE, V\u00e4ster\u00e5s (2020)","DOI":"10.1109\/PDP50117.2020.00022"},{"key":"2_CR16","doi-asserted-by":"crossref","unstructured":"Lu, J.,\u00a0Lin, J.,\u00a0Wang, Z.: A reconfigurable DNN training accelerator on FPGA. In: 2020 IEEE Workshop on Signal Processing Systems (SiPS). IEEE (2020)","DOI":"10.1109\/SiPS50750.2020.9195234"},{"key":"2_CR17","doi-asserted-by":"crossref","unstructured":"Bottou, L.: Large-scale machine learning with stochastic gradient descent. In: Proceedings of COMPSTAT 2010, pp. 177\u2013186. Physica-Verlag HD (2010)","DOI":"10.1007\/978-3-7908-2604-3_16"},{"key":"2_CR18","doi-asserted-by":"crossref","unstructured":"Horowitz, M.: 1.1 computing\u2019s energy problem (and what we can do about it). In: 2014 IEEE International Solid-State Circuits Conference Digest of Technical Papers (ISSCC), pp. 10\u201314. IEEE, San Francisco (2014)","DOI":"10.1109\/ISSCC.2014.6757323"},{"key":"2_CR19","unstructured":"Ioffe, S.,\u00a0Szegedy, C.: Batch normalization: accelerating deep network training by reducing internal covariate shift. In: Proceedings of the 32nd International Conference on Machine Learning, Lille, France, 07\u201309 July 2015, vol.\u00a037, pp. 448\u2013456 (2015). https:\/\/proceedings.mlr.press\/v37\/ioffe15.html"},{"key":"2_CR20","unstructured":"Chiley, V., et al.: Online normalization for training neural networks. Adv. Neural Inf. Process. Syst. 32 (2019). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2019\/file\/cb3ce9b06932da6faaa7fc70d5b5d2f4-Paper.pdf"},{"key":"2_CR21","doi-asserted-by":"crossref","unstructured":"Steinert, F., Schelten, N., Schulte, A., Stabernack, B.: Hardware and software components towards the integration of network-attached accelerators into data centers, pp. 149\u2013153. IEEE, Kranj (2020)","DOI":"10.1109\/DSD51259.2020.00033"},{"key":"2_CR22","unstructured":"IEEE Standard for Local and metropolitan area networks\u2013Media Access Control (MAC) Bridges and Virtual Bridged Local Area Networks\u2013Amendment 17: Priority-based Flow Control, Std"},{"key":"2_CR23","doi-asserted-by":"crossref","unstructured":"Sandler, M.,\u00a0Howard, A.,\u00a0Zhu, M.,\u00a0Zhmoginov, A., Chen, L.-C.: MobileNetV2: inverted residuals and linear bottlenecks. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition. IEEE (2018)","DOI":"10.1109\/CVPR.2018.00474"}],"container-title":["Lecture Notes in Computer Science","Architecture of Computing Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-66146-4_2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,25]],"date-time":"2024-11-25T15:22:49Z","timestamp":1732548169000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-66146-4_2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031661457","9783031661464"],"references-count":23,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-66146-4_2","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"1 August 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ARCS","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Architecture of Computing Systems","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Potsdam","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 May 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 May 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"37","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"arcs2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/arcs-conference.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}