{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T16:41:33Z","timestamp":1773247293491,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":29,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,1]],"date-time":"2025-03-01T00:00:00Z","timestamp":1740787200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3]]},"DOI":"10.1145\/3725798.3725807","type":"proceedings-article","created":{"date-parts":[[2025,5,13]],"date-time":"2025-05-13T10:40:50Z","timestamp":1747132850000},"page":"55-61","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Efficient Parallel Implementation of Non-Local Means Algorithm on GPU"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6933-6491","authenticated-orcid":false,"given":"Xiang","family":"Li","sequence":"first","affiliation":[{"name":"Nanjing university, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4447-0480","authenticated-orcid":false,"given":"Qiong","family":"Chang","sequence":"additional","affiliation":[{"name":"Institute of Science Tokyo, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1753-7317","authenticated-orcid":false,"given":"Yun","family":"Li","sequence":"additional","affiliation":[{"name":"Nanjing university, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3038-7678","authenticated-orcid":false,"given":"Jun","family":"Miyazaki","sequence":"additional","affiliation":[{"name":"Institute of Science Tokyo, Tokyo, Japan"}]}],"member":"320","published-online":{"date-parts":[[2025,5,13]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"crossref","unstructured":"Khalid Ahmad Cris Cecka Michael Garland and Mary Hall. 2024. Exploring data layout for sparse tensor times dense matrix on GPUs. ACM Transactions on Architecture and Code Optimization 21 1 (2024) 1\u201320.","DOI":"10.1145\/3633462"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.38"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"crossref","unstructured":"Qiong Chang Xiang Li Yun Li and Jun Miyazaki. 2023. Multi-directional Sobel operator kernel on GPUs. J. Parallel and Distrib. Comput. 177 (2023) 160\u2013170.","DOI":"10.1016\/j.jpdc.2023.03.004"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"crossref","unstructured":"P. Coup\u00e9 P. Yger S. Prima P. Hellier C. Kervrann and C. Barillot. 2008. An optimized blockwise nonlocal means denoising filter for 3-D magnetic resonance images. IEEE Transactions on Medical Imaging 27 4 (2008) 425\u2013441.","DOI":"10.1109\/TMI.2007.906087"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"crossref","unstructured":"Davide Cozzolino Luisa Verdoliva Giuseppe Scarpa and Giovanni Poggi. 2020. Nonlocal CNN SAR image despeckling. Remote Sensing 12 6 (2020) 1006.","DOI":"10.3390\/rs12061006"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Cristovao Cruz Alessandro Foi Vladimir Katkovnik and Karen Egiazarian. 2018. Nonlocality-reinforced convolutional neural networks for image denoising. IEEE Signal Processing Letters 25 8 (2018) 1216\u20131220.","DOI":"10.1109\/LSP.2018.2850222"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/WAINA.2016.110"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"crossref","unstructured":"Axel Davy and Thibaud Ehret. 2021. GPU acceleration of NLM BM3D and VBM3D. Journal of Real-Time Image Processing 18 1 (2021) 57\u201374.","DOI":"10.1007\/s11554-020-00945-4"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"crossref","unstructured":"Linwei Fan Fan Zhang Hui Fan and Caiming Zhang. 2019. Brief review of image denoising techniques. Visual Computing for Industry Biomedicine and Art 2 1 (2019) 7.","DOI":"10.1186\/s42492-019-0016-7"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"crossref","unstructured":"Bhawna Goyal Ayush Dogra Sunil Agrawal Balwinder\u00a0Singh Sohi and Apoorav Sharma. 2020. Image denoising review: From classical to state-of-the-art approaches. Information Fusion 55 (2020) 220\u2013244.","DOI":"10.1016\/j.inffus.2019.09.003"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Donatella Granata Umberto Amato and Bruno Alfano. 2019. MRI denoising by nonlocal means on multi-GPU. Journal of Real-Time Image Processing 16 2 (2019) 523\u2013533.","DOI":"10.1007\/s11554-016-0566-2"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"crossref","unstructured":"Gregory Herschlag Seyong Lee Jeffrey\u00a0S. Vetter and Amanda Randles. 2021. Analysis of GPU data access patterns on complex geometries for the D3Q19 lattice Boltzmann algorithm. IEEE Transactions on Parallel and Distributed Systems 32 10 (2021) 2400\u20132414.","DOI":"10.1109\/TPDS.2021.3061895"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"crossref","unstructured":"J. Hu J. Zhou and X. Wu. 2016. Non-local MRI denoising using random sampling. Magnetic Resonance Imaging 34 7 (2016) 990\u2013999.","DOI":"10.1016\/j.mri.2016.04.008"},{"key":"e_1_3_3_1_15_2","first-page":"263","volume-title":"MIPPR 2009: Medical Imaging, Parallel Processing of Images, and Optimization Techniques","author":"Huang Kuidong","year":"2009","unstructured":"Kuidong Huang, Dinghua Zhang, and Kai Wang. 2009. Non-local means denoising algorithm accelerated by GPU. In MIPPR 2009: Medical Imaging, Parallel Processing of Images, and Optimization Techniques, Vol.\u00a07497. SPIE, 263\u2013270."},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICTA53157.2021.9661666"},{"key":"e_1_3_3_1_17_2","unstructured":"Khronos Group. 2023. OpenCL Specification. [Online]. Available: https:\/\/registry.khronos.org\/OpenCL\/specs\/3.0-unified\/pdf\/OpenCL_API.pdf."},{"key":"e_1_3_3_1_18_2","first-page":"681","volume-title":"Parallel Computing: Technology Trends","author":"Koizumi Hayato","year":"2020","unstructured":"Hayato Koizumi and Tsutomu Maruyama. 2020. An Implementation of Non-Local Means Algorithm on FPGA. In Parallel Computing: Technology Trends. IOS Press, 681\u2013690."},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00727"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"crossref","unstructured":"X. Li Q. Chang A. Zha S. Chang Y. Li and J. Miyazaki. 2024. An Optimized GPU Implementation for GIST Descriptor. ACM Transactions on Architecture and Code Optimization 21 4 (2024) 1\u201324.","DOI":"10.1145\/3689339"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"crossref","unstructured":"Gangzhao Lu Weizhe Zhang and Zheng Wang. 2021. Optimizing depthwise separable convolution operations on GPUs. IEEE Transactions on Parallel and Distributed Systems 33 1 (2021) 70\u201387.","DOI":"10.1109\/TPDS.2021.3084813"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"crossref","unstructured":"Gangzhao Lu Weizhe Zhang and Zheng Wang. 2021. Optimizing depthwise separable convolution operations on GPUs. IEEE Transactions on Parallel and Distributed Systems 33 1 (2021) 70\u201387.","DOI":"10.1109\/TPDS.2021.3084813"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"M. Mahmoudi and G. Sapiro. 2005. Fast image and video denoising via nonlocal means of similar neighborhoods. IEEE Signal Processing Letters 12 12 (2005) 839\u2013842.","DOI":"10.1109\/LSP.2005.859509"},{"key":"e_1_3_3_1_24_2","unstructured":"Nvidia Corporation. 2021. CUDA C Programming Guide. [Online]. Available: https:\/\/docs.nvidia.com\/cuda\/archive\/11.2.0\/cuda-c-programming-guide\/."},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC43674.2020.9286196"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"crossref","unstructured":"Chunwei Tian Lunke Fei Wenxian Zheng Yong Xu Wangmeng Zuo and Chia-Wen Lin. 2020. Deep learning on image denoising: An overview. Neural Networks 131 (2020) 251\u2013275.","DOI":"10.1016\/j.neunet.2020.07.025"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"Weizhi Xu Yintai Sun Shengyu Fan Hui Yu and Xin Fu. 2023. Accelerating convolutional neural network by exploiting sparsity on GPUs. ACM Transactions on Architecture and Code Optimization 20 3 (2023) 1\u201326.","DOI":"10.1145\/3600092"},{"key":"e_1_3_3_1_29_2","unstructured":"Hao Zhang Feng Li Shilong Liu Lei Zhang Hang Su Jun Zhu Lionel\u00a0M. Ni and Heung-Yeung Shum. 2022. Dino: Detr with improved denoising anchor boxes for end-to-end object detection. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2203.03605 (2022)."},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00068"}],"event":{"name":"GPGPU 2025: 17th Workshop on General Purpose Processing Using GPU","location":"Las Vegas NV USA","acronym":"GPGPU 2025"},"container-title":["Proceedings of the 17th Workshop on General Purpose Processing Using GPU"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3725798.3725807","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3725798.3725807","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:57:04Z","timestamp":1750298224000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3725798.3725807"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3]]},"references-count":29,"alternative-id":["10.1145\/3725798.3725807","10.1145\/3725798"],"URL":"https:\/\/doi.org\/10.1145\/3725798.3725807","relation":{},"subject":[],"published":{"date-parts":[[2025,3]]},"assertion":[{"value":"2025-05-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}