{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T15:44:35Z","timestamp":1772725475573,"version":"3.50.1"},"reference-count":173,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,1,31]]},"DOI":"10.1109\/hpca68181.2026.11408457","type":"proceedings-article","created":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T20:47:22Z","timestamp":1772657242000},"page":"1-19","source":"Crossref","is-referenced-by-count":1,"title":["WATOS: Efficient LLM Training Strategies and Architecture Co-Exploration for Wafer-Scale Chip"],"prefix":"10.1109","author":[{"given":"Huizheng","family":"Wang","sequence":"first","affiliation":[{"name":"School of Integrated Circuits, BNRist Tsinghua University,Beijing,China,100084"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zichuan","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, BNRist Tsinghua University,Beijing,China,100084"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hongbin","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, BNRist Tsinghua University,Beijing,China,100084"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jingxiang","family":"Hou","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, BNRist Tsinghua University,Beijing,China,100084"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Taiquan","family":"Wei","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, BNRist Tsinghua University,Beijing,China,100084"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chao","family":"Li","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Shanghai Jiao Tong University,Shanghai,China,200240"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yang","family":"Hu","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, BNRist Tsinghua University,Beijing,China,100084"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shouyi","family":"Yin","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, BNRist Tsinghua University,Beijing,China,100084"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ECTC.2018.00288"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582061"},{"key":"ref3","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ARITH.2019.00022"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589048"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00022"},{"key":"ref7","article-title":"Cerebras systems: Achieving industry best AI performance through a systems approach","volume-title":"Cerebras","year":"2021"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00025"},{"key":"ref9","article-title":"Training deep nets with sublinear memory cost","author":"Chen","year":"2016","journal-title":"arXiv preprint"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/2654822.2541967"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2016.2616357"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICNP61940.2024.10858570"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1023\/A:1009642405419"},{"key":"ref14","article-title":"Flashattention-2: Faster attention with better parallelism and work partitioning","author":"Dao","year":"2023","journal-title":"arXiv preprint"},{"key":"ref15","article-title":"Transformers are SSMs: Generalized models and efficient algorithms through structured state space duality","author":"Dao","year":"2024","journal-title":"arXiv preprint"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i25.34898"},{"key":"ref17","article-title":"Large scale distributed deep networks","volume":"25","author":"Dean","year":"2012","journal-title":"Advances in neural information processing systems"},{"key":"ref18","article-title":"Deepseek-v3 technical report","volume-title":"DeepSeek-AI","year":"2025"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC49661.2025.10904499"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3580305.3599572"},{"key":"ref21","article-title":"Scaling rectified flow transformers for high-resolution image synthesis","volume-title":"Forty-first international conference on machine learning","author":"Esser","year":"2024"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441593"},{"key":"ref23","article-title":"PALM: A efficient performance simulator for tiled accelerators with large-scale model training","author":"Fang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00074"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00102"},{"key":"ref26","doi-asserted-by":"crossref","first-page":"930","DOI":"10.1145\/3613424.3614310","article-title":"Heterogeneous die-to-die interfaces: Enabling more flexible chiplet interconnection systems","volume-title":"Proceedings of the 56th Annual IEEE\/ACM International Symposium on Microarchitecture","author":"Feng","year":"2023"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304014"},{"key":"ref28","article-title":"Pipeorgan: Efficient inter-operation pipelining with flexible spatial organization and interconnects","author":"Garg","year":"2024","journal-title":"arXiv preprint"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651328"},{"key":"ref30","article-title":"Mamba: Linear-time sequence modeling with selective state spaces","volume-title":"First Conference on Language Modeling","author":"Gu","year":"2024"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1016\/j.mssp.2021.106182"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00019"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1016\/j.fmre.2023.10.020"},{"key":"ref34","article-title":"WaferLLM: Large language model inference at wafer scale","author":"He","year":"2025","journal-title":"arXiv preprint"},{"key":"ref35","first-page":"505","article-title":"Campo: Cost-aware performance optimization for mixed-Precision neural network training","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"He","year":"2022"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446762"},{"key":"ref37","article-title":"New heuristic and metaheuristic approaches applied to the multiple-choice multidimensional knapsack problem","author":"Hiremath","year":"2008","journal-title":"Wright State University"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.4249\/scholarpedia.1482"},{"key":"ref39","doi-asserted-by":"crossref","first-page":"209","DOI":"10.1145\/3613424.3623797","article-title":"Dosa: Differentiable model-based one-loop search for DNN accelerators","volume-title":"Proceedings of the 56th Annual IEEE\/ACM International Symposium on Microarchitecture","author":"Hong","year":"2023"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS55109.2022.00039"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ECTC51909.2023.00091"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/IITC61274.2024.10732470"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/MCAS.2024.3349669"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ECTC51909.2023.00174"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00023"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/3725843.3756079"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ECTC32696.2021.00028"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.48550\/arxiv.1811.06965"},{"key":"ref49","article-title":"Hecaton: Training large language models with scalable chiplet systems","author":"Huang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607102"},{"key":"ref51","first-page":"497","article-title":"Checkmate: Breaking the memory wall with optimal tensor rematerialization","volume-title":"Proceedings of Machine Learning and Systems","volume":"2","author":"Jain","year":"2020"},{"key":"ref52","first-page":"27","article-title":"Optimizing DNN computation with relaxed graph substitutions","volume-title":"Proceedings of Machine Learning and Systems","volume":"1","author":"Jia","year":"2019"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/tcad.2021.3096458"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1016\/j.eng.2023.11.023"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS61541.2024.00035"},{"key":"ref56","article-title":"A study of BFLOAT16 for deep learning training","author":"Kalamkar","year":"2019","journal-title":"arXiv preprint"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00058"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00045"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC49661.2025.10904761"},{"key":"ref60","first-page":"1663916653","article-title":"Bpipe: Memory-balanced pipeline parallelism for training large language models","volume-title":"International Conference on Machine Learning. PMLR","author":"Kim","year":"2023"},{"key":"ref61","article-title":"Llmem: Estimating GPU memory usage for fine-tuning pretrained LLMs","author":"Kim","year":"2024","journal-title":"arXiv preprint"},{"key":"ref62","article-title":"DFModel: Design Space Optimization of Large-Scale Systems Exploiting Dataflow Mappings","author":"Ko","year":"2024","journal-title":"arXiv preprint"},{"key":"ref63","first-page":"341","article-title":"Reducing activation recomputation in large Transformer models","volume-title":"Proceedings of Machine Learning and Systems","volume":"5","author":"Korthikanti","year":"2023"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358252"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00069"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/TCPMT.2023.3234007"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/TCPMT.2021.3096786"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/TCPMT.2020.2996658"},{"key":"ref69","article-title":"GShard: Scaling giant models with conditional computation and automatic sharding","author":"Lepikhin","year":"2020","journal-title":"arXiv preprint"},{"issue":"06","key":"ref70","first-page":"993","article-title":"Research on wafer-scale chip mapping task based on genetic algorithm","volume":"46","author":"LI","year":"2024","journal-title":"Computer Engineering & Science"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1145\/3695053.3731008"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/DAC63849.2025.11132830"},{"key":"ref73","article-title":"Large language model inference acceleration: A comprehensive hardware perspective","author":"Li","year":"2024","journal-title":"arXiv preprint"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1145\/3605573.3605613"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476145"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/HCS55958.2022.9895479"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2024.3386628"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/21.257766"},{"key":"ref79","doi-asserted-by":"crossref","first-page":"631","DOI":"10.1145\/3489517.3530565","article-title":"Partition and place finite element model on wafer-scale engine","volume-title":"Proceedings of the 59th ACM\/IEEE Design Automation Conference","author":"Liu","year":"2022"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607073"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1016\/S0377-2217(97)00388-3"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3627042"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/TETCI.2023.3336928"},{"key":"ref84","volume-title":"Knapsack problems: Algorithms and computer implementations","author":"Martello","year":"1990"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071098"},{"key":"ref86","article-title":"Llama 3 model card","volume-title":"Meta AI","year":"2024"},{"key":"ref87","article-title":"Llama-3.1\u2013405B","volume-title":"Meta AI","year":"2024"},{"key":"ref88","article-title":"Mixed precision training","author":"Micikevicius","year":"2017","journal-title":"arXiv preprint"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC49661.2025.10904793"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.1998.658762"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC19947.2020.9063103"},{"key":"ref92","first-page":"7937","article-title":"Memory-efficient pipeline-parallel DNN training","volume-title":"International Conference on Machine Learning. PMLR","author":"Narayanan","year":"2021"},{"key":"ref93","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3458817.3476209","article-title":"Efficient large-scale language model training on GPU clusters using Megatron-LM","volume-title":"Proceedings of the international conference for high performance computing, networking, storage and analysis","author":"Narayanan","year":"2021"},{"key":"ref94","first-page":"arXiv","article-title":"Memory-efficient training of LLMs with larger mini-batches","author":"Nguyen","year":"2024","journal-title":"arXiv e-prints"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1145\/1592568.1592575"},{"key":"ref96","article-title":"NVIDIA H100","volume-title":"NVIDIA","year":"2022"},{"key":"ref97","article-title":"Megatron-LM","volume-title":"NVIDIA","year":"2025"},{"key":"ref98","article-title":"NVIDIA GB300","volume-title":"NVIDIA","year":"2025"},{"key":"ref99","article-title":"NVIDIA GB200 NVL72: Powering the new era of computing","volume-title":"NVIDIA"},{"key":"ref100","article-title":"NVIDIA DGX GH200: The next-generation AI supercomputer for the generative AI era","volume-title":"NVIDIA, Tech. Rep.","year":"2023"},{"key":"ref101","article-title":"DCRA: A distributed chiplet-based reconfigurable architecture for irregular applications","author":"Orenes-Vera","year":"2023","journal-title":"arXiv preprint"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS61541.2024.00015"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1145\/3505170.3506730"},{"key":"ref104","author":"Pal","year":"2021","journal-title":"Scale-out packageless processing. University of California, Los Angeles"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18074.2021.9586194"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2019.00042"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.14569\/ijacsa.2024.0150610"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00042"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1145\/3725843.3756090"},{"key":"ref110","first-page":"17573","article-title":"Poet: Training neural networks on tiny devices with integrated rematerialization and paging","volume-title":"International Conference on Machine Learning. PMLR","author":"Patil","year":"2022"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/72.286892"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2017.43"},{"key":"ref113","article-title":"Training large neural networks with constant memory using a new execution algorithm","author":"Pudipeddi","year":"2020","journal-title":"arXiv preprint"},{"key":"ref114","article-title":"Qwen3-Next-80B-A3B-Instruct","volume-title":"Qwen","year":"2025"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"ref116","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3458817.3476205","article-title":"Zeroinfinity: Breaking the GPU memory wall for extreme scale deep learning","volume-title":"Proceedings of the international conference for high performance computing, networking, storage and analysis","author":"Rajbhandari","year":"2021"},{"key":"ref117","article-title":"FRED: Flexible reduction-distribution interconnect and communication implementation for wafer-scale distributed training of DNN models","author":"Rashidi","year":"2024","journal-title":"arXiv preprint"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527382"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"ref120","first-page":"551","article-title":"Zero-offload: Democratizing billionscale model training","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Ren","year":"2021"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00066"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1016\/S0304-3975(00)00406-0"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358302"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/ECTC51687.2025.00005"},{"key":"ref125","article-title":"Megatron-LM: Training multi-billion parameter language models using model parallelism","author":"Shoeybi","year":"2019","journal-title":"arXiv preprint"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00018"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.3390\/en14020376"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2020.2983860"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651359"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/HCS55958.2022.9895534"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00083"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3614303"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1177\/1094342005051521"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/HCS61935.2024.10665247"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00024"},{"key":"ref136","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv preprint"},{"key":"ref137","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv preprint"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1109\/TVLSI.2024.3455332"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1109\/TCSI.2022.3229690"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00093"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1109\/TSP.2021.3064781"},{"key":"ref142","article-title":"BitStopper: An efficient Transformer attention accelerator via stage-fusion and early termination","author":"Wang","year":"2025","journal-title":"arXiv preprint"},{"key":"ref143","article-title":"LAPA: Log-domain prediction-driven dynamic sparsity accelerator for Transformer model","author":"Wang","year":"2025","journal-title":"arXiv preprint"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1109\/TCSII.2025.3596228"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1145\/3725843.3756037"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1109\/TCSII.2021.3121081"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.23919\/ICS.2024.3515003"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1109\/ICDSP.2018.8631556"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1145\/3302424.3303953"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00091"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707231"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-95-1021-4_29"},{"key":"ref153","article-title":"Mask \/ Reticle","volume-title":"Wikichip","year":"2022"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00068"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS57527.2023.00035"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00096"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1145\/2783258.2783323"},{"key":"ref158","article-title":"GSPMD: General and scalable parallelization for ML computation graphs","author":"Xu","year":"2021","journal-title":"arXiv preprint"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.1145\/3695053.3731101"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-95-1021-4_3"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1145\/3695053.3731045"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.1109\/ECTC51529.2024.00176"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378514"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1145\/3695053.3731016"},{"key":"ref165","article-title":"Actions speak louder than words: Trillion-parameter sequential transducers for generative recommendations","author":"Zhai","year":"2024","journal-title":"arXiv preprint"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00082"},{"key":"ref167","doi-asserted-by":"publisher","DOI":"10.1109\/DAC63849.2025.11132883"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2023.3327392"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.14778\/3611540.3611569"},{"key":"ref170","first-page":"559","article-title":"Alpa: Automating inter-and intraoperator parallelism for distributed deep learning","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zheng","year":"2022"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071077"},{"key":"ref172","article-title":"Theseus: Towards highefficiency wafer-scale chip design space exploration for large language models","author":"Zhu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1145\/3711818"}],"event":{"name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","location":"Sydney, Australia","start":{"date-parts":[[2026,1,31]]},"end":{"date-parts":[[2026,2,4]]}},"container-title":["2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11408404\/11408433\/11408457.pdf?arnumber=11408457","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T06:50:43Z","timestamp":1772693443000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11408457\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,31]]},"references-count":173,"URL":"https:\/\/doi.org\/10.1109\/hpca68181.2026.11408457","relation":{},"subject":[],"published":{"date-parts":[[2026,1,31]]}}}