{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T06:45:30Z","timestamp":1771915530019,"version":"3.50.1"},"publisher-location":"Singapore","reference-count":16,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819500055","type":"print"},{"value":"9789819500062","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-95-0006-2_43","type":"book-chapter","created":{"date-parts":[[2025,7,24]],"date-time":"2025-07-24T07:35:02Z","timestamp":1753342502000},"page":"514-524","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["FGSMS: Fine-Grained SM Scheduling for Efficient Deep Learning Computing"],"prefix":"10.1007","author":[{"given":"Nanjian","family":"Zhou","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fan","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhizhuo","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chaonong","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,7,25]]},"reference":[{"key":"43_CR1","unstructured":"Chen, T., et al.: TVM: an automated end-to-end optimizing compiler for deep learning. In: 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 2018), pp. 578\u2013594 (2018)"},{"key":"43_CR2","doi-asserted-by":"crossref","unstructured":"Holmes, C., Mawhirter, D., He, Y., Yan, F., Wu, B.: GRNN: low-latency and scalable RNN inference on GPUs. In: Proceedings of the Fourteenth EuroSys Conference, pp. 1\u201316 (2019)","DOI":"10.1145\/3302424.3303949"},{"key":"43_CR3","unstructured":"Arpaci-Dusseau, R.H., Arpaci-Dusseau, A.C.: Operating systems: three easy pieces (2018)"},{"key":"43_CR4","first-page":"167","volume":"3","author":"Y Ding","year":"2021","unstructured":"Ding, Y., Zhu, L., Jia, Z., Pekhimenko, G., Han, S.: IOS: Inter-operator scheduler for CNN acceleration. Proc. Mach. Learn. Syst. 3, 167\u2013180 (2021)","journal-title":"Proc. Mach. Learn. Syst."},{"key":"43_CR5","first-page":"8343","volume":"33","author":"W Kwon","year":"2020","unstructured":"Kwon, W., Gyeong-In, Y., Jeong, E., Chun, B.-G.: Nimble: lightweight and parallel GPU task scheduling for deep learning. Adv. Neural. Inf. Process. Syst. 33, 8343\u20138354 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"43_CR6","doi-asserted-by":"crossref","unstructured":"Gupta, K., Stuart, J.A., Owens, J.D.: A study of persistent threads style GPU programming for GPGPU workloads. IEEE (2012)","DOI":"10.1109\/InPar.2012.6339596"},{"key":"43_CR7","doi-asserted-by":"crossref","unstructured":"Zhao, H., et al.: Tacker: tensor-cuda core kernel fusion for improving the GPU utilization while ensuring QoS. In: 2022 IEEE International Symposium on High-Performance Computer Architecture (HPCA), pp. 800\u2013813. IEEE (2022)","DOI":"10.1109\/HPCA53966.2022.00064"},{"key":"43_CR8","doi-asserted-by":"crossref","unstructured":"Kay\u0131ran, O., Jog, A., Kandemir, M.T., Das, C.R.: Neither more nor less: optimizing thread-level parallelism for GPGPUs. In: Proceedings of the 22nd International Conference on Parallel Architectures and Compilation Techniques, pp. 157\u2013166. IEEE (2013)","DOI":"10.1109\/PACT.2013.6618806"},{"key":"43_CR9","unstructured":"Abadi, M., et al.: TensorFlow: a system for large-scale machine learning. In: 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 2016), pp. 265\u2013283 (2016)"},{"key":"43_CR10","doi-asserted-by":"crossref","unstructured":"Zheng, Z., et al.: Astitch: enabling a new multi-dimensional optimization space for memory-intensive ML training and inference on modern SIMT architectures. In: Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, pp. 359\u2013373 (2022)","DOI":"10.1145\/3503222.3507723"},{"key":"43_CR11","first-page":"1","volume":"4","author":"J Zhao","year":"2022","unstructured":"Zhao, J., et al.: Apollo: automatic partition-based operator fusion through layer by layer optimization. Proc. Mach. Learn. Syst. 4, 1\u201319 (2022)","journal-title":"Proc. Mach. Learn. Syst."},{"key":"43_CR12","doi-asserted-by":"crossref","unstructured":"Zheng, S., et al.: Chimera: an analytical optimizing framework for effective compute-intensive operators fusion. In: 2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA), pp. 1113\u20131126. IEEE (2023)","DOI":"10.1109\/HPCA56546.2023.10071018"},{"key":"43_CR13","unstructured":"Chen, T., et al.: Learning to optimize tensor programs. In: Advances in Neural Information Processing Systems, vol. 31 (2018)"},{"issue":"6","key":"43_CR14","doi-asserted-by":"publisher","first-page":"519","DOI":"10.1145\/2499370.2462176","volume":"48","author":"J Ragan-Kelley","year":"2013","unstructured":"Ragan-Kelley, J., Barnes, C., Adams, A., Paris, S., Durand, F., Amarasinghe, S.: Halide: a language and compiler for optimizing parallelism, locality, and recomputation in image processing pipelines. ACM SIGPLAN Not. 48(6), 519\u2013530 (2013)","journal-title":"ACM SIGPLAN Not."},{"key":"43_CR15","doi-asserted-by":"crossref","unstructured":"Belviranli, M.E., Lee, S., Vetter, J.S., Bhuyan, L.N.: Juggler: a dependence-aware task-based execution framework for GPUs. In: Proceedings of the 23rd ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, pp. 54\u201367 (2018)","DOI":"10.1145\/3178487.3178492"},{"key":"43_CR16","doi-asserted-by":"crossref","unstructured":"Wu, B., Chen, G., Li, D., Shen, X., Vetter, J.: Enabling and exploiting flexible task assignment on GPU through SM-centric program transformations. In: Proceedings of the 29th ACM on International Conference on Supercomputing, pp. 119\u2013130 (2015)","DOI":"10.1145\/2751205.2751213"}],"container-title":["Lecture Notes in Computer Science","Advanced Intelligent Computing Technology and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-0006-2_43","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T02:58:42Z","timestamp":1771901922000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-0006-2_43"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819500055","9789819500062"],"references-count":16,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-0006-2_43","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"25 July 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICIC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Intelligent Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Ningbo","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26 July 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 July 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icic2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.ic-icc.cn\/icg\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}