{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,24]],"date-time":"2025-12-24T18:52:13Z","timestamp":1766602333675,"version":"3.48.0"},"reference-count":73,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"4","license":[{"start":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T00:00:00Z","timestamp":1764547200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/100000028","name":"Semiconductor Research Corporation","doi-asserted-by":"crossref","id":[{"id":"10.13039\/100000028","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE J. Emerg. Sel. Topics Circuits Syst."],"published-print":{"date-parts":[[2025,12]]},"DOI":"10.1109\/jetcas.2025.3592677","type":"journal-article","created":{"date-parts":[[2025,7,25]],"date-time":"2025-07-25T17:57:11Z","timestamp":1753466231000},"page":"634-647","source":"Crossref","is-referenced-by-count":0,"title":["Democratizing Customization for ML at the Edge Through Hetero-Chiplet SiP Architectures"],"prefix":"10.1109","volume":"15","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-0780-0350","authenticated-orcid":false,"given":"Matthew Joseph","family":"Adiletta","sequence":"first","affiliation":[{"name":"Paulson School of Engineering and Applied Sciences, Harvard University, Cambridge, MA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5730-9904","authenticated-orcid":false,"given":"Gu-Yeon","family":"Wei","sequence":"additional","affiliation":[{"name":"Paulson School of Engineering and Applied Sciences, Harvard University, Cambridge, MA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0662-7889","authenticated-orcid":false,"given":"David","family":"Brooks","sequence":"additional","affiliation":[{"name":"Paulson School of Engineering and Applied Sciences, Harvard University, Cambridge, MA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"issue":"3","key":"ref1","doi-asserted-by":"crossref","first-page":"26","DOI":"10.1109\/IOTM.001.2200056","article-title":"Towards factory-scale edge robotic systems: Challenges and research directions","volume":"5","author":"Baxi","year":"2022","journal-title":"IEEE Internet Things Mag."},{"key":"ref2","article-title":"Self-driving cars: A survey","volume":"165","author":"Badu\u00e9","year":"2020","journal-title":"Expert Syst. Appl."},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2022.3176400"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/HCS52781.2021.9567097"},{"volume-title":"NVIDIA Jetson AGX Orin Series","year":"2022","author":"Karumbunathan","key":"ref5"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ETFA.2015.7301601"},{"key":"ref7","first-page":"1","article-title":"Hardware for deep learning","volume-title":"Proc. IEEE Hot Chips Symp. (HCS)","author":"Dally"},{"key":"ref8","first-page":"395","article-title":"Chopin: Composing cost-effective custom chips with algorithmic chiplets","volume-title":"Proc. IEEE 39th Int. Conf. Comput. Design (ICCD)","author":"Ehrett"},{"key":"ref9","article-title":"Arithmetic intensity balancing convolution for hardware-aware efficient block design","author":"Choi","year":"2023","journal-title":"arXiv:2304.04016"},{"key":"ref10","first-page":"144","article-title":"A 7nm 4-core AI chip with 25.6TFLOPS hybrid FP8 training, 102.4TOPS INT4 inference and workload-aware throttling","volume-title":"IEEE Int. Solid-State Circuits Conf. (ISSCC) Dig. Tech. Papers","author":"Agrawal"},{"key":"ref11","first-page":"342","article-title":"22.9 a 12nm 18.1TFLOPs\/W sparse transformer processor with entropy-based early exit, mixed-precision predication and fine-grained power management","volume-title":"IEEE Int. Solid-State Circuits Conf. (ISSCC) Dig. Tech. Papers","author":"Tambe"},{"key":"ref12","first-page":"158","article-title":"A 25 mm2 SoC for IoT devices with 18ms noise-robust speech-to-text latency via Bayesian speech denoising and attention-based sequence-to-sequence DNN speech recognition in 16nm FinFET","volume-title":"IEEE Int. Solid-State Circuits Conf. (ISSCC) Dig. Tech. Papers","author":"Tambe"},{"issue":"1","key":"ref13","doi-asserted-by":"crossref","first-page":"227","DOI":"10.1109\/JSSC.2022.3213521","article-title":"An energy-efficient transformer processor exploiting dynamic weak relevances in global attention","volume":"58","author":"Wang","year":"2023","journal-title":"IEEE J. Solid-State Circuits"},{"issue":"5","key":"ref14","doi-asserted-by":"crossref","first-page":"78","DOI":"10.1109\/MM.2023.3295848","article-title":"The Intel programmable and integrated unified memory architecture graph analytics processor","volume":"43","author":"Aananthakrishnan","year":"2023","journal-title":"IEEE Micro"},{"key":"ref15","first-page":"48","article-title":"3.2 the A100 datacenter GPU and ampere architecture","volume-title":"IEEE Int. Solid-State Circuits Conf. (ISSCC) Dig. Tech. Papers","author":"Choquette"},{"volume-title":"AMD CDNA 3 Architecture","year":"2023","author":"Devices","key":"ref16"},{"key":"ref17","first-page":"1","article-title":"Embedded AI performances of Nvidia\u2019s Jetson orin SoC series","volume-title":"Proc. 17 \u00e8me Colloque Nat. Du GDR SOC2","author":"Archet"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358302"},{"key":"ref19","first-page":"1013","article-title":"NN-baton: DNN workload orchestration and chiplet granularity exploration for multichip accelerators","volume-title":"Proc. ACM\/IEEE 48th Annu. Int. Symp. Comput. Archit. (ISCA)","author":"Tan"},{"key":"ref20","article-title":"Multi-objective hardware-mapping co-optimisation for multi-DNN workloads on chiplet-based accelerators","author":"Das","year":"2022","journal-title":"arXiv:2210.14657"},{"key":"ref21","article-title":"Gemini: Mapping and architecture co-exploration for large-scale DNN chiplet accelerators","author":"Cai","year":"2023","journal-title":"arXiv:2312.16436"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00049"},{"key":"ref23","first-page":"57","article-title":"Pioneering chiplet technology and design for the AMD EPYC and Ryzen processor families: Industrial product","volume-title":"Proc. ACM\/IEEE 48th Annu. Int. Symp. Comput. Archit. (ISCA)","author":"Naffziger"},{"key":"ref24","first-page":"1","article-title":"Architecting for flexibility and value with next gen Intel Xeon processors","volume-title":"Proc. IEEE Hot Chips Symp. (HCS)","author":"Gianos"},{"key":"ref25","first-page":"1","article-title":"AMD next-generation FPGA built from chiplets","volume-title":"Proc. IEEE Hot Chips Symp. (HCS)","author":"Gaitonde"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/1498765.1498785"},{"key":"ref27","article-title":"Efficiently scaling transformer inference","author":"Pope","year":"2022","journal-title":"arXiv:2211.05102"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS55109.2022.00023"},{"key":"ref29","article-title":"Deep residual learning for image recognition","author":"He","year":"2015","journal-title":"arXiv:1512.03385"},{"key":"ref30","article-title":"Very deep convolutional networks for large-scale image recognition","author":"Simonyan","year":"2014","journal-title":"arXiv:1409.1556"},{"key":"ref31","article-title":"Generative AI beyond LLMs: System implications of multi-modal generation","author":"Golden","year":"2023","journal-title":"arXiv:2312.14385"},{"key":"ref32","article-title":"High-resolution image synthesis with latent diffusion models","author":"Rombach","year":"2021","journal-title":"arXiv:2112.10752"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"ref34","article-title":"Language models are few-shot learners","author":"Brown","year":"2020","journal-title":"arXiv:2005.14165"},{"key":"ref35","article-title":"FlashAttention: Fast and memory-efficient exact attention with IO-awareness","author":"Dao","year":"2022","journal-title":"arXiv:2205.14135"},{"key":"ref36","doi-asserted-by":"crossref","DOI":"10.1016\/j.cose.2022.102746","article-title":"Latest trends of security and privacy in recommender systems: A comprehensive review and future perspectives","volume":"118","author":"Himeur","year":"2022","journal-title":"Comput. Secur."},{"key":"ref37","first-page":"982","article-title":"DeepRecSys: A system for optimizing end-to-end at-scale neural recommendation inference","volume-title":"Proc. ACM\/IEEE 47th Annu. Int. Symp. Comput. Archit. (ISCA)","author":"Gupta"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1906.00091"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1609.02907"},{"key":"ref40","article-title":"Open graph benchmark: Datasets for machine learning on graphs","author":"Hu","year":"2020","journal-title":"arXiv:2005.00687"},{"volume-title":"Zero ASIC","year":"2025","key":"ref41"},{"issue":"6","key":"ref42","doi-asserted-by":"crossref","first-page":"4","DOI":"10.1109\/MM.2015.74","article-title":"Architectural simulators considered harmful","volume":"35","author":"Nowatzki","year":"2015","journal-title":"IEEE Micro"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/2063384.2063454"},{"key":"ref44","first-page":"473","article-title":"Accel-sim: An extensible simulation framework for validated GPU modeling","volume-title":"Proc. ACM\/IEEE 47th Annu. Int. Symp. Comput. Archit. (ISCA)","author":"Khairy"},{"key":"ref45","first-page":"81","article-title":"ASTRA-SIM: Enabling SW\/HW co-design exploration for distributed DL training platforms","volume-title":"Proc. IEEE Int. Symp. Perform. Anal. Syst. Softw. (ISPASS)","author":"Rashidi"},{"key":"ref46","article-title":"MAD max beyond single-node: Enabling large machine learning model acceleration on distributed systems","author":"Hsia","year":"2023","journal-title":"arXiv:2310.02784"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/2654822.2541967"},{"key":"ref48","first-page":"246","article-title":"A multi-mode 8K-MAC HW-Utilization-Aware neural processing unit with a unified multi-precision datapath in 4nm flagship mobile SoC","volume-title":"IEEE Int. Solid-State Circuits Conf. (ISSCC) Dig. Tech. Papers","volume":"65","author":"Park"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC19947.2020.9063111"},{"key":"ref50","first-page":"130","article-title":"An 11.5TOPS\/W 1024-MAC butterfly structure dual-core sparsity-aware neural processing unit in 8nm flagship mobile SoC","volume-title":"IEEE Int. Solid-State Circuits Conf. (ISSCC) Dig. Tech. Papers","author":"Song"},{"key":"ref51","first-page":"146","article-title":"A 28nm 12.1TOPS\/W dual-mode CNN processor using effective-weight-based convolution and error-compensation-based prediction","volume-title":"IEEE Int. Solid-State Circuits Conf. (ISSCC) Dig. Tech. Papers","author":"Mo"},{"key":"ref52","first-page":"136","article-title":"A 12nm programmable convolution-efficient neural-processing-unit chip achieving 825TOPS","volume-title":"IEEE Int. Solid-State Circuits Conf. (ISSCC) Dig. Tech. Papers","author":"Jiao"},{"key":"ref53","first-page":"422","article-title":"An 8.09TOPS\/W neural engine leveraging bit-sparsified sign-magnitude multiplications and dual adder trees","volume-title":"IEEE Int. Solid-State Circuits Conf. (ISSCC) Dig. Tech. Papers","author":"An"},{"key":"ref54","first-page":"1","article-title":"A 28nm 11.2TOPS\/W hardware-utilization-aware neural-network accelerator with dynamic dataflow","volume-title":"IEEE Int. Solid-State Circuits Conf. (ISSCC) Dig. Tech. Papers)","author":"Du"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/islped58423.2023.10244348"},{"key":"ref56","first-page":"692","article-title":"ELSA: Hardware\u2013software co-design for efficient, lightweight self-attention mechanism in neural networks","volume-title":"Proc. ACM\/IEEE 48th Annu. Int. Symp. Comput. Archit. (ISCA)","author":"Ham"},{"key":"ref57","first-page":"1","article-title":"A 28nm 49.7TOPS\/W sparse transformer processor with random-projection-based speculation, multi-stationary dataflow, and redundant partial product elimination","volume-title":"Proc. IEEE Asian Solid-State Circuits Conf. (A-SSCC)","author":"Qin"},{"key":"ref58","first-page":"16","article-title":"A 17\u201395.6 TOPS\/W deep learning inference accelerator with per-vector scaled 4-bit quantization for transformers in 5nm","volume-title":"Proc. IEEE Symp. VLSI Technol. Circuits","author":"Keller"},{"key":"ref59","first-page":"1","article-title":"Novel graph processor architecture, prototype system, and results","volume-title":"Proc. IEEE High Perform. Extreme Comput. Conf. (HPEC)","author":"Song"},{"issue":"9","key":"ref60","doi-asserted-by":"crossref","first-page":"1511","DOI":"10.1109\/TC.2020.3014632","article-title":"EnGN: A high-throughput and energy-efficient accelerator for large graph neural networks","volume":"70","author":"Liang","year":"2021","journal-title":"IEEE Trans. Comput."},{"key":"ref61","first-page":"15","article-title":"HyGCN: A GCN accelerator with hybrid architecture","volume-title":"Proc. IEEE Int. Symp. High Perform. Comput. Archit. (HPCA)","author":"Yan"},{"key":"ref62","first-page":"922","article-title":"AWB-GCN: A graph convolutional network accelerator with runtime workload rebalancing","volume-title":"Proc. 53rd Annu. IEEE\/ACM Int. Symp. Microarchitecture (MICRO)","author":"Geng"},{"key":"ref63","first-page":"775","article-title":"GCNAX: A flexible and energy-efficient accelerator for graph convolutional neural networks","volume-title":"Proc. IEEE Int. Symp. High-Perform. Comput. Archit. (HPCA)","author":"Li"},{"key":"ref64","article-title":"Rubik: A hierarchical architecture for efficient graph learning","author":"Chen","year":"2020","journal-title":"arXiv:2009.12495"},{"key":"ref65","first-page":"1","article-title":"Hardware acceleration of graph neural networks","volume-title":"Proc. 57th ACM\/IEEE Design Autom. Conf. (DAC)","author":"Auten"},{"key":"ref66","first-page":"1","article-title":"AMD Instinct MI200 series accelerator and node architectures","volume-title":"Proc. IEEE Hot Chips 34 Symp. (HCS)","author":"Smith"},{"issue":"9","key":"ref67","doi-asserted-by":"crossref","first-page":"1423","DOI":"10.1109\/TCPMT.2022.3207195","article-title":"Universal chiplet interconnect express (UCIe): An open industry standard for innovations with chiplets at package level","volume":"12","author":"Das Sharma","year":"2022","journal-title":"IEEE Trans. Compon., Packag., Manuf. Technol."},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/VLSITechnologyandCir46783.2024.10631545"},{"key":"ref69","first-page":"1","article-title":"Hetero-mark, a benchmark suite for CPU-GPU collaborative computing","volume-title":"Proc. IEEE Int. Symp. Workload Characterization (IISWC)","author":"Sun"},{"key":"ref70","first-page":"1","article-title":"New 3rd gen Intel Xeon scalable processor (codename: Ice lake-SP)","volume-title":"Proc. IEEE Hot Chips 32 Symp. (HCS)","author":"Papazian"},{"volume-title":"Ethos-U65 NPU-Efficient AI With Double ML Performance","year":"2025","key":"ref71"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1145\/3476999"},{"issue":"4","key":"ref73","doi-asserted-by":"crossref","first-page":"1062","DOI":"10.1109\/TVLSI.2020.2968904","article-title":"Design space exploration for chiplet-assembly-based processors","volume":"28","author":"Pal","year":"2020","journal-title":"IEEE Trans. Very Large Scale Integr. (VLSI) Syst."}],"container-title":["IEEE Journal on Emerging and Selected Topics in Circuits and Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/5503868\/11313699\/11096615.pdf?arnumber=11096615","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,24]],"date-time":"2025-12-24T18:46:30Z","timestamp":1766601990000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11096615\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12]]},"references-count":73,"journal-issue":{"issue":"4"},"URL":"https:\/\/doi.org\/10.1109\/jetcas.2025.3592677","relation":{},"ISSN":["2156-3357","2156-3365"],"issn-type":[{"type":"print","value":"2156-3357"},{"type":"electronic","value":"2156-3365"}],"subject":[],"published":{"date-parts":[[2025,12]]}}}