{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,21]],"date-time":"2026-01-21T07:40:34Z","timestamp":1768981234006,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,3,25]],"date-time":"2023-03-25T00:00:00Z","timestamp":1679702400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"CCF","award":["2112562,1955246"],"award-info":[{"award-number":["2112562,1955246"]}]},{"DOI":"10.13039\/100000183","name":"Army Research Office","doi-asserted-by":"publisher","award":["W911NF-19-2-0107"],"award-info":[{"award-number":["W911NF-19-2-0107"]}],"id":[{"id":"10.13039\/100000183","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,3,25]]},"DOI":"10.1145\/3582016.3582017","type":"proceedings-article","created":{"date-parts":[[2023,3,20]],"date-time":"2023-03-20T16:59:03Z","timestamp":1679331543000},"page":"134-146","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["DefT: Boosting Scalability of Deformable Convolution Operations on GPUs"],"prefix":"10.1145","author":[{"given":"Edward","family":"Hanson","sequence":"first","affiliation":[{"name":"Duke University, Durham, USA"}]},{"given":"Mark","family":"Horton","sequence":"additional","affiliation":[{"name":"Duke University, Durham, USA"}]},{"given":"Hai (Helen)","family":"Li","sequence":"additional","affiliation":[{"name":"Duke University, Durham, USA"}]},{"given":"Yiran","family":"Chen","sequence":"additional","affiliation":[{"name":"Duke University, Durham, USA"}]}],"member":"320","published-online":{"date-parts":[[2023,3,25]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2022. Nsight compute. https:\/\/docs.nvidia.com\/nsight-compute\/NsightCompute\/index.html \t\t\t\t  2022. Nsight compute. https:\/\/docs.nvidia.com\/nsight-compute\/NsightCompute\/index.html"},{"key":"e_1_3_2_1_2_1","unstructured":"2022. Nsight Systems v2022.2.1 User guide. https:\/\/docs.nvidia.com\/nsight-systems\/UserGuide\/index.html \t\t\t\t  2022. Nsight Systems v2022.2.1 User guide. https:\/\/docs.nvidia.com\/nsight-systems\/UserGuide\/index.html"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783725"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485137"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3316781.3317936"},{"key":"e_1_3_2_1_6_1","volume-title":"Chen Change Loy, and Dahua Lin","author":"Chen Kai","year":"2019","unstructured":"Kai Chen , Jiaqi Wang , Jiangmiao Pang , Yuhang Cao , Yu Xiong , Xiaoxiao Li , Shuyang Sun , Wansen Feng , Ziwei Liu , Jiarui Xu , Zheng Zhang , Dazhi Cheng , Chenchen Zhu , Tianheng Cheng , Qijie Zhao , Buyu Li , Xin Lu , Rui Zhu , Yue Wu , Jifeng Dai , Jingdong Wang , Jianping Shi , Wanli Ouyang , Chen Change Loy, and Dahua Lin . 2019 . MMDetection: Open MMLab Detection Toolbox and Benchmark . https:\/\/doi.org\/10.48550\/ARXIV.1906.07155 10.48550\/ARXIV.1906.07155 Kai Chen, Jiaqi Wang, Jiangmiao Pang, Yuhang Cao, Yu Xiong, Xiaoxiao Li, Shuyang Sun, Wansen Feng, Ziwei Liu, Jiarui Xu, Zheng Zhang, Dazhi Cheng, Chenchen Zhu, Tianheng Cheng, Qijie Zhao, Buyu Li, Xin Lu, Rui Zhu, Yue Wu, Jifeng Dai, Jingdong Wang, Jianping Shi, Wanli Ouyang, Chen Change Loy, and Dahua Lin. 2019. MMDetection: Open MMLab Detection Toolbox and Benchmark. https:\/\/doi.org\/10.48550\/ARXIV.1906.07155"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2016.2616357"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/JETCAS.2019.2910232"},{"key":"e_1_3_2_1_9_1","volume-title":"Advances in Neural Information Processing Systems","author":"Chi Lu","year":"2020","unstructured":"Lu Chi , Borui Jiang , and Yadong Mu. 2020. Fast Fourier Convolution . In Advances in Neural Information Processing Systems , H. Larochelle, M. Ranzato, R. Hadsell, M.F. Balcan, and H. Lin (Eds.). 33, Curran Associates, Inc. , 4479\u20134488. https:\/\/proceedings.neurips.cc\/paper\/ 2020 \/file\/2fd5d41ec6cfab47e32164d5624269b1-Paper.pdf Lu Chi, Borui Jiang, and Yadong Mu. 2020. Fast Fourier Convolution. In Advances in Neural Information Processing Systems, H. Larochelle, M. Ranzato, R. Hadsell, M.F. Balcan, and H. Lin (Eds.). 33, Curran Associates, Inc., 4479\u20134488. https:\/\/proceedings.neurips.cc\/paper\/2020\/file\/2fd5d41ec6cfab47e32164d5624269b1-Paper.pdf"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3453688.3461480"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.89"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2019.2939832"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304014"},{"key":"#cr-split#-e_1_3_2_1_15_1.1","unstructured":"Ali Hassani Steven Walton Jiachen Li Shen Li and Humphrey Shi. 2022. Neighborhood Attention Transformer. https:\/\/doi.org\/10.48550\/ARXIV.2204.07143 10.48550\/ARXIV.2204.07143"},{"key":"#cr-split#-e_1_3_2_1_15_1.2","unstructured":"Ali Hassani Steven Walton Jiachen Li Shen Li and Humphrey Shi. 2022. Neighborhood Attention Transformer. https:\/\/doi.org\/10.48550\/ARXIV.2204.07143"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3431920.3439295"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3140659.3080246"},{"key":"e_1_3_2_1_20_1","volume-title":"FLAT: An Optimized Dataflow for Mitigating Attention Bottlenecks. https:\/\/doi.org\/10.48550\/ARXIV.2107.06419","author":"Kao Sheng-Chun","year":"2021","unstructured":"Sheng-Chun Kao , Suvinay Subramanian , Gaurav Agrawal , Amir Yazdanbakhsh , and Tushar Krishna . 2021 . FLAT: An Optimized Dataflow for Mitigating Attention Bottlenecks. https:\/\/doi.org\/10.48550\/ARXIV.2107.06419 10.48550\/ARXIV.2107.06419 Sheng-Chun Kao, Suvinay Subramanian, Gaurav Agrawal, Amir Yazdanbakhsh, and Tushar Krishna. 2021. FLAT: An Optimized Dataflow for Mitigating Attention Bottlenecks. https:\/\/doi.org\/10.48550\/ARXIV.2107.06419"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358252"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00070"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480043"},{"key":"#cr-split#-e_1_3_2_1_24_1.1","unstructured":"Tsung-Yi Lin Michael Maire Serge Belongie Lubomir Bourdev Ross Girshick James Hays Pietro Perona Deva Ramanan C. Lawrence Zitnick and Piotr Doll\u00e1r. 2014. Microsoft COCO: Common Objects in Context. https:\/\/doi.org\/10.48550\/ARXIV.1405.0312 10.48550\/ARXIV.1405.0312"},{"key":"#cr-split#-e_1_3_2_1_24_1.2","unstructured":"Tsung-Yi Lin Michael Maire Serge Belongie Lubomir Bourdev Ross Girshick James Hays Pietro Perona Deva Ramanan C. Lawrence Zitnick and Piotr Doll\u00e1r. 2014. Microsoft COCO: Common Objects in Context. https:\/\/doi.org\/10.48550\/ARXIV.1405.0312"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/1542275.1542313"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454083"},{"key":"e_1_3_2_1_28_1","unstructured":"Nvidia Corporation. 2020. Nvidia Ampere GA102 GPU Architecture. \t\t\t\t  Nvidia Corporation. 2020. Nvidia Ampere GA102 GPU Architecture."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CSDE53843.2021.9718419"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1186\/S40537-019-0197-0"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_33"},{"key":"e_1_3_2_1_32_1","volume-title":"Unifying Data, Model and Hybrid Parallelism in Deep Learning via Tensor Tiling. CoRR, abs\/1805.04170","author":"Wang Minjie","year":"2018","unstructured":"Minjie Wang , Chien-Chin Huang , and Jinyang Li. 2018. Unifying Data, Model and Hybrid Parallelism in Deep Learning via Tensor Tiling. CoRR, abs\/1805.04170 ( 2018 ), arXiv:1805.04170. arxiv:1805.04170 Minjie Wang, Chien-Chin Huang, and Jinyang Li. 2018. Unifying Data, Model and Hybrid Parallelism in Deep Learning via Tensor Tiling. CoRR, abs\/1805.04170 (2018), arXiv:1805.04170. arxiv:1805.04170"},{"key":"e_1_3_2_1_33_1","unstructured":"Yuxin Wu Alexander Kirillov Francisco Massa Wan-Yen Lo and Ross Girshick. 2019. Detectron2. https:\/\/github.com\/facebookresearch\/detectron2 \t\t\t\t  Yuxin Wu Alexander Kirillov Francisco Massa Wan-Yen Lo and Ross Girshick. 2019. Detectron2. https:\/\/github.com\/facebookresearch\/detectron2"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00475"},{"key":"#cr-split#-e_1_3_2_1_35_1.1","unstructured":"Dawen Xu Cheng Chu Cheng Liu Ying Wang Huawei Li Xiaowei Li and Kwang-Ting Cheng. 2021. Energy-Efficient Accelerator Design for Deformable Convolution Networks. https:\/\/doi.org\/10.48550\/ARXIV.2107.02547 10.48550\/ARXIV.2107.02547"},{"key":"#cr-split#-e_1_3_2_1_35_1.2","unstructured":"Dawen Xu Cheng Chu Cheng Liu Ying Wang Huawei Li Xiaowei Li and Kwang-Ting Cheng. 2021. Energy-Efficient Accelerator Design for Deformable Convolution Networks. https:\/\/doi.org\/10.48550\/ARXIV.2107.02547"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00309"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00953"}],"event":{"name":"ASPLOS '23: 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3","location":"Vancouver BC Canada","acronym":"ASPLOS '23","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3582016.3582017","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:46:44Z","timestamp":1750178804000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3582016.3582017"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,3,25]]},"references-count":40,"alternative-id":["10.1145\/3582016.3582017","10.1145\/3582016"],"URL":"https:\/\/doi.org\/10.1145\/3582016.3582017","relation":{},"subject":[],"published":{"date-parts":[[2023,3,25]]},"assertion":[{"value":"2023-03-25","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}