{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,19]],"date-time":"2026-06-19T16:37:02Z","timestamp":1781887022848,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":108,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,21]]},"DOI":"10.1145\/3695053.3731045","type":"proceedings-article","created":{"date-parts":[[2025,6,20]],"date-time":"2025-06-20T16:43:11Z","timestamp":1750437791000},"page":"49-64","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":12,"title":["PD Constraint-aware Physical\/Logical Topology Co-Design for Network on Wafer"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-2221-4706","authenticated-orcid":false,"given":"Qize","family":"Yang","sequence":"first","affiliation":[{"name":"School of Integrated Circuits, BNRist, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3501-3148","authenticated-orcid":false,"given":"Taiquan","family":"Wei","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, BNRist, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0282-6039","authenticated-orcid":false,"given":"Sihan","family":"Guan","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, BNRist, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2650-6506","authenticated-orcid":false,"given":"Chengran","family":"Li","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, BNRist, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-6972-9235","authenticated-orcid":false,"given":"Haoran","family":"Shang","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, BNRist, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8666-8463","authenticated-orcid":false,"given":"Jinyi","family":"Deng","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, BNRist, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9763-8208","authenticated-orcid":false,"given":"Huizheng","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, BNRist, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6218-4659","authenticated-orcid":false,"given":"Chao","family":"Li","sequence":"additional","affiliation":[{"name":"SJTU, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6971-6572","authenticated-orcid":false,"given":"Lei","family":"Wang","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9480-728X","authenticated-orcid":false,"given":"Yan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8438-8588","authenticated-orcid":false,"given":"Shouyi","family":"Yin","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, BNRist, Tsinghua University, Beijing, China and Shanghai Artificial Intelligence Laboratory, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6942-4395","authenticated-orcid":false,"given":"Yang","family":"Hu","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, BNRist, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,6,20]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia\u00a0Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat et\u00a0al. 2023. Gpt-4 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.08774 (2023)."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"crossref","unstructured":"Akhil Arunkumar Evgeny Bolotin Benjamin Cho Ugljesa Milic Eiman Ebrahimi Oreste Villa Aamer Jaleel Carole-Jean Wu and David Nellans. 2017. MCM-GPU: Multi-chip-module GPUs for continued performance scalability. ACM SIGARCH Computer Architecture News 45 2 (2017) 320\u2013332.","DOI":"10.1145\/3140659.3080231"},{"key":"e_1_3_3_1_4_2","unstructured":"Russel Aubusson. 1979. Wafer-scale integration of semiconductor memory.Ph.\u00a0D. Dissertation. Middlesex Polytechnic."},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"crossref","unstructured":"Vivek Bakshi. 2009. EUV lithography. (2009).","DOI":"10.1117\/3.769214"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/RTAS48715.2020.00007"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.34"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18072.2020.9218539"},{"key":"e_1_3_3_1_9_2","unstructured":"Tom\u00a0B Brown. 2020. Language models are few-shot learners. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2005.14165 (2020)."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"crossref","unstructured":"James\u00a0A Burns Brian\u00a0F Aull Chenson\u00a0K Chen Chang-Lee Chen Craig\u00a0L Keast Jeffrey\u00a0M Knecht Vyshanavi Suntharalingam Keith Warner Peter\u00a0W Wyatt and D-RW Yost. 2006. A wafer-scale 3-D circuit integration technology. IEEE Transactions on Electron Devices 53 10 (2006) 2507\u20132516.","DOI":"10.1109\/TED.2006.882043"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00022"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Jose\u00a0M Camara Miquel Moreto Enrique Vallejo Ramon Beivide Jose Miguel-Alonso Carmen Mart\u00ednez and Javier Navaridas. 2010. Twisted torus topologies for enhanced interconnection networks. IEEE Transactions on Parallel and Distributed Systems 21 12 (2010) 1765\u20131778.","DOI":"10.1109\/TPDS.2010.30"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"crossref","unstructured":"Richard\u00a0O Carlson and Constantine\u00a0A Neugebauer. 1986. Future trends in wafer scale integration. Proc. IEEE 74 12 (1986) 1741\u20131752.","DOI":"10.1109\/PROC.1986.13689"},{"key":"e_1_3_3_1_14_2","unstructured":"Shubhangi\u00a0D Chawade Mahendra\u00a0A Gaikwad and Rajendra\u00a0M Patrikar. 2012. Review of XY routing algorithm for network-on-chip architecture. International Journal of Computer Applications 43 21 (2012) 975\u20138887."},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"crossref","unstructured":"Shixin Chen Shanyi Li Zhen Zhuang Su Zheng Zheng Liang Tsung-Yi Ho Bei Yu and Alberto\u00a0L Sangiovanni-Vincentelli. 2023. Floorplet: Performance-Aware Floorplan Framework for Chiplet Integration. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems (2023).","DOI":"10.1109\/TCAD.2023.3347302"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00025"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.23919\/VLSIT.2017.7998198"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/ECTC32696.2021.00033"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071117"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"crossref","unstructured":"Jack Choquette. 2023. Nvidia hopper h100 gpu: Scaling performance. IEEE Micro 43 3 (2023) 9\u201317.","DOI":"10.1109\/MM.2023.3256796"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/HCS49909.2020.9220622"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"crossref","unstructured":"Jack Choquette Wishwesh Gandhi Olivier Giroux Nick Stam and Ronny Krashinsky. 2021. Nvidia a100 tensor core gpu: Performance and innovation. IEEE Micro 41 2 (2021) 29\u201335.","DOI":"10.1109\/MM.2021.3061394"},{"key":"e_1_3_3_1_23_2","unstructured":"Aakanksha Chowdhery Sharan Narang Jacob Devlin Maarten Bosma Gaurav Mishra Adam Roberts Paul Barham Hyung\u00a0Won Chung Charles Sutton Sebastian Gehrmann et\u00a0al. 2023. Palm: Scaling language modeling with pathways. Journal of Machine Learning Research 24 240 (2023) 1\u2013113."},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/ECTC32862.2020.00013"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"Ayse Coskun Furkan Eris Ajay Joshi Andrew\u00a0B Kahng Yenai Ma Aditya Narayan and Vaishnav Srinivas. 2020. Cross-layer co-optimization of network design and chiplet placement in 2.5-D systems. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 39 12 (2020) 5183\u20135196.","DOI":"10.1109\/TCAD.2020.2970019"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"crossref","unstructured":"Sajed Dadashi Midia Reshadi Akram Reza and Ahmad Khademzadeh. 2019. An expandable topology with low wiring congestion for silicon interposer-based network-on-chip systems. Transactions on Emerging Telecommunications Technologies 30 12 (2019) e3747.","DOI":"10.1002\/ett.3747"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"crossref","unstructured":"Dally and Seitz. 1987. Deadlock-free message routing in multiprocessor interconnection networks. IEEE Transactions on computers 100 5 (1987) 547\u2013553.","DOI":"10.1109\/TC.1987.1676939"},{"key":"e_1_3_3_1_28_2","first-page":"7480","volume-title":"International Conference on Machine Learning","author":"Dehghani Mostafa","year":"2023","unstructured":"Mostafa Dehghani, Josip Djolonga, Basil Mustafa, Piotr Padlewski, Jonathan Heek, Justin Gilmer, Andreas\u00a0Peter Steiner, Mathilde Caron, Robert Geirhos, Ibrahim Alabdulmohsin, et\u00a0al. 2023. Scaling vision transformers to 22 billion parameters. In International Conference on Machine Learning. PMLR, 7480\u20137512."},{"key":"e_1_3_3_1_29_2","unstructured":"Jacob Devlin. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1810.04805 (2018)."},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447605"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"crossref","unstructured":"Yinxiao Feng and Kaisheng Ma. 2024. Switch-Less Dragonfly on Wafers: A Scalable Interconnection Architecture based on Wafer-Scale Integration. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.10290 (2024).","DOI":"10.1109\/SC41406.2024.00102"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"crossref","unstructured":"Stefan Feuerriegel Jochen Hartmann Christian Janiesch and Patrick Zschech. 2024. Generative ai. Business & Information Systems Engineering 66 1 (2024) 111\u2013126.","DOI":"10.1007\/s12599-023-00834-7"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"crossref","unstructured":"Nan Fu Yanxiang Liu Xiaolong Ma and Zanfeng Chen. 2019. EUV lithography: state-of-the-art review. J. Microelectron. Manuf 2 2 (2019) 1\u20136.","DOI":"10.33079\/jomm.19020202"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"crossref","unstructured":"Fiona Fui-Hoon\u00a0Nah Ruilin Zheng Jingyuan Cai Keng Siau and Langtao Chen. 2023. Generative AI and ChatGPT: Applications challenges and AI-human collaboration. 277\u2013304\u00a0pages.","DOI":"10.1080\/15228053.2023.2233814"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00016"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00082"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/ECTC51909.2023.00108"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"crossref","unstructured":"Yang Hu Xinhan Lin Huizheng Wang Zhen He Xingmao Yu Jiahao Zhang Qize Yang Zheng Xu Sihan Guan Jiahao Fang et\u00a0al. 2024. Wafer-Scale Computing: Advancements Challenges and Future Perspectives [Feature]. IEEE Circuits and Systems Magazine 24 1 (2024) 52\u201381.","DOI":"10.1109\/MCAS.2024.3349669"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037713"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00023"},{"key":"e_1_3_3_1_41_2","unstructured":"Yanping Huang Youlong Cheng Ankur Bapna Orhan Firat Dehao Chen Mia Chen HyoukJoong Lee Jiquan Ngiam Quoc\u00a0V Le Yonghui Wu et\u00a0al. 2019. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/DAC56929.2023.10248006"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/DAC56929.2023.10247754"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"crossref","unstructured":"Subramanian\u00a0S Iyer S Jangam and Boris Vaisband. 2019. Silicon interconnect fabric: A versatile heterogeneous integration platform for AI systems. IBM Journal of Research and Development 63 6 (2019) 5\u20131.","DOI":"10.1147\/JRD.2019.2940427"},{"key":"e_1_3_3_1_45_2","unstructured":"Sam\u00a0Ade Jacobs Masahiro Tanaka Chengming Zhang Minjia Zhang Shuaiwen\u00a0Leon Song Samyam Rajbhandari and Yuxiong He. 2023. Deepspeed ulysses: System optimizations for enabling training of extreme long sequence transformer models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.14509 (2023)."},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/ECTC.2018.00197"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.61"},{"key":"e_1_3_3_1_48_2","first-page":"1","volume-title":"Proceedings of the 39th International Conference on Computer-Aided Design","author":"Jiang Bentian","year":"2020","unstructured":"Bentian Jiang, Jingsong Chen, Jinwei Liu, Lixin Liu, Fangzhou Wang, Xiaopeng Zhang, and Evangeline\u00a0FY Young. 2020. CU. POKer: placing DNNs on wafer-scale AI accelerator with optimal kernel sizing. In Proceedings of the 39th International Conference on Computer-Aided Design. 1\u20139."},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"crossref","unstructured":"Yuho Jin Eun\u00a0Jung Kim and Timothy\u00a0Mark Pinkston. 2011. Communication-aware globally-coordinated on-chip networks. IEEE Transactions on Parallel and Distributed Systems 23 2 (2011) 242\u2013254.","DOI":"10.1109\/TPDS.2011.164"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1109\/SCC57168.2023.00009"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589350"},{"key":"e_1_3_3_1_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2007.15"},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"crossref","unstructured":"John Kim Wiliam\u00a0J Dally Steve Scott and Dennis Abts. 2008. Technology-driven highly-scalable dragonfly topology. ACM SIGARCH Computer Architecture News 36 3 (2008) 77\u201388.","DOI":"10.1145\/1394608.1382129"},{"key":"e_1_3_3_1_54_2","doi-asserted-by":"crossref","unstructured":"Christophe Kopp Stephane Bernabe Badhise\u00a0Ben Bakir Jean-Marc Fedeli Regis Orobtchouk Franz Schrank Henri Porte Lars Zimmermann and Tolga Tekin. 2010. Silicon photonic circuits: on-CMOS integration fiber optical coupling and packaging. IEEE Journal of selected topics in quantum electronics 17 3 (2010) 498\u2013509.","DOI":"10.1109\/JSTQE.2010.2071855"},{"key":"e_1_3_3_1_55_2","doi-asserted-by":"crossref","unstructured":"Richard\u00a0E Korf. 1985. Depth-first iterative-deepening: An optimal admissible tree search. Artificial intelligence 27 1 (1985) 97\u2013109.","DOI":"10.1016\/0004-3702(85)90084-0"},{"key":"e_1_3_3_1_56_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00017"},{"key":"e_1_3_3_1_57_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00069"},{"key":"e_1_3_3_1_58_2","doi-asserted-by":"crossref","unstructured":"Gary Lauterbach. 2021. The path to successful wafer-scale integration: The cerebras story. IEEE Micro 41 6 (2021) 52\u201357.","DOI":"10.1109\/MM.2021.3112025"},{"key":"e_1_3_3_1_59_2","doi-asserted-by":"crossref","unstructured":"Leighton and Leiserson. 1985. Wafer-scale integration of systolic arrays. IEEE Transactions on computers 100 5 (1985) 448\u2013461.","DOI":"10.1109\/TC.1985.1676584"},{"key":"e_1_3_3_1_60_2","unstructured":"Dmitry Lepikhin HyoukJoong Lee Yuanzhong Xu Dehao Chen Orhan Firat Yanping Huang Maxim Krikun Noam Shazeer and Zhifeng Chen. 2020. Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2006.16668 (2020)."},{"key":"e_1_3_3_1_61_2","doi-asserted-by":"publisher","DOI":"10.1145\/2540708.2540736"},{"key":"e_1_3_3_1_62_2","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389732"},{"key":"e_1_3_3_1_63_2","doi-asserted-by":"publisher","DOI":"10.1109\/HCS55958.2022.9895479"},{"key":"e_1_3_3_1_64_2","doi-asserted-by":"crossref","unstructured":"Sean Lie. 2023. Cerebras architecture deep dive: First look inside the hardware\/software co-design for deep learning. IEEE Micro 43 3 (2023) 18\u201330.","DOI":"10.1109\/MM.2023.3256384"},{"key":"e_1_3_3_1_65_2","doi-asserted-by":"crossref","unstructured":"Xiao Lin Shuzhou Sun Wei Huang Bin Sheng Ping Li and David\u00a0Dagan Feng. 2021. EAPT: efficient attention pyramid transformer for image processing. IEEE Transactions on Multimedia 25 (2021) 50\u201361.","DOI":"10.1109\/TMM.2021.3120873"},{"key":"e_1_3_3_1_66_2","unstructured":"Aixin Liu Bei Feng Bing Xue Bingxuan Wang Bochao Wu Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan et\u00a0al. 2024. Deepseek-v3 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.19437 (2024)."},{"key":"e_1_3_3_1_67_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3627042"},{"key":"e_1_3_3_1_68_2","doi-asserted-by":"publisher","DOI":"10.1145\/3625549.3658693"},{"key":"e_1_3_3_1_69_2","unstructured":"Canhui Luo Zhouxing Su and Zhipeng L\u00fc. 2023. MS-CLS: An Effective Partitioning and Placement Metaheuristic for Wafer-Scale Physics Modeling. IEEE Transactions on Emerging Topics in Computational Intelligence (2023)."},{"key":"e_1_3_3_1_70_2","unstructured":"Marcelo Orenes-Vera Esin Tureci Margaret Martonosi and David Wentzlaff. 2023. DCRA: A distributed chiplet-based reconfigurable architecture for irregular applications. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.15443 (2023)."},{"key":"e_1_3_3_1_71_2","doi-asserted-by":"publisher","DOI":"10.1145\/3414622.3431906"},{"key":"e_1_3_3_1_72_2","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18074.2021.9586194"},{"key":"e_1_3_3_1_73_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2019.00042"},{"key":"e_1_3_3_1_74_2","unstructured":"Sunyoung Park. 2021. High Bandwidth Interposer Switch (HBI-S) Topology in Modular System on Chip. Ph.\u00a0D. Dissertation."},{"key":"e_1_3_3_1_75_2","first-page":"4055","volume-title":"International conference on machine learning","author":"Parmar Niki","year":"2018","unstructured":"Niki Parmar, Ashish Vaswani, Jakob Uszkoreit, Lukasz Kaiser, Noam Shazeer, Alexander Ku, and Dustin Tran. 2018. Image transformer. In International conference on machine learning. PMLR, 4055\u20134064."},{"key":"e_1_3_3_1_76_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589057"},{"key":"e_1_3_3_1_77_2","unstructured":"Colin Raffel Noam Shazeer Adam Roberts Katherine Lee Sharan Narang Michael Matena Yanqi Zhou Wei Li and Peter\u00a0J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research 21 140 (2020) 1\u201367."},{"key":"e_1_3_3_1_78_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS48437.2020.00018"},{"key":"e_1_3_3_1_79_2","unstructured":"Saeed Rashidi William Won Sudarshan Srinivasan Puneet Gupta and Tushar Krishna. 2024. FRED: Flexible REduction-Distribution Interconnect and Communication Implementation for Wafer-Scale Distributed Training of DNN Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.19580 (2024)."},{"key":"e_1_3_3_1_80_2","doi-asserted-by":"crossref","unstructured":"Peter Sanders Jochen Speck and Jesper\u00a0Larsson Tr\u00e4ff. 2009. Two-tree algorithms for full bandwidth broadcast reduction and scan. Parallel Comput. 35 12 (2009) 581\u2013594.","DOI":"10.1016\/j.parco.2009.09.001"},{"key":"e_1_3_3_1_81_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCAS.2010.5536970"},{"key":"e_1_3_3_1_82_2","doi-asserted-by":"crossref","unstructured":"Tom Schram Surajit Sutar Iuliana Radu and Inge Asselberghs. 2022. Challenges of wafer-scale integration of 2D semiconductors for high-performance transistor circuits. Advanced Materials 34 48 (2022) 2109796.","DOI":"10.1002\/adma.202109796"},{"key":"e_1_3_3_1_83_2","doi-asserted-by":"publisher","DOI":"10.1145\/3229574.3229577"},{"key":"e_1_3_3_1_84_2","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358302"},{"key":"e_1_3_3_1_85_2","doi-asserted-by":"crossref","unstructured":"Debendra\u00a0Das Sharma Gerald Pasdast Zhiguo Qian and Kemal Aygun. 2022. Universal chiplet interconnect express (UCIe): An open industry standard for innovations with chiplets at package level. IEEE Transactions on Components Packaging and Manufacturing Technology 12 9 (2022) 1423\u20131431.","DOI":"10.1109\/TCPMT.2022.3207195"},{"key":"e_1_3_3_1_86_2","unstructured":"Mingcong Song Xinru Tang Fengfan Hou Jing Li Wei Wei Yipeng Ma Runqiu Xiao Hongjie Si Dingcheng Jiang Shouyi Yin et\u00a0al. 2024. Tackling the dynamicity in a production llm serving system with sota optimizations via hybrid prefill\/decode\/verify scheduling on efficient meta-kernels. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.18106 (2024)."},{"key":"e_1_3_3_1_87_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00068"},{"key":"e_1_3_3_1_88_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00018"},{"key":"e_1_3_3_1_89_2","doi-asserted-by":"publisher","DOI":"10.1109\/SLIP.2019.8771333"},{"key":"e_1_3_3_1_90_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAD.2017.8203849"},{"key":"e_1_3_3_1_91_2","doi-asserted-by":"crossref","unstructured":"Emil Talpes Debjit\u00a0Das Sarma Doug Williams Sahil Arora Thomas Kunjan Benjamin Floering Ankit Jalote Christopher Hsiong Chandrasekhar Poorna Vaidehi Samant et\u00a0al. 2023. The microarchitecture of dojo tesla\u2019s exa-scale computer. IEEE Micro 43 3 (2023) 31\u201339.","DOI":"10.1109\/MM.2023.3258906"},{"key":"e_1_3_3_1_92_2","doi-asserted-by":"publisher","DOI":"10.1109\/HCS55958.2022.9895534"},{"key":"e_1_3_3_1_93_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00083"},{"key":"e_1_3_3_1_94_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar et\u00a0al. 2023. LLaMA: open and efficient foundation language models. arXiv. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.13971 (2023)."},{"key":"e_1_3_3_1_95_2","doi-asserted-by":"publisher","DOI":"10.1109\/CCGRID.2019.00057"},{"key":"e_1_3_3_1_96_2","doi-asserted-by":"publisher","DOI":"10.1109\/CCGRID.2019.00057"},{"key":"e_1_3_3_1_97_2","doi-asserted-by":"crossref","unstructured":"Mary\u00a0K Vernon and Udi Manber. 1988. Distributed round-robin and first-come first-serve protocols and their applications to multiprocessor bus arbitration. ACM SIGARCH Computer Architecture News 16 2 (1988) 269\u2013279.","DOI":"10.1145\/633625.52431"},{"key":"e_1_3_3_1_98_2","doi-asserted-by":"crossref","unstructured":"Mark Wade Erik Anderson Shahab Ardalan Pavan Bhargava Sidney Buchbinder Michael\u00a0L Davenport John Fini Haiwei Lu Chen Li Roy Meade et\u00a0al. 2020. TeraPHY: a chiplet technology for low-power high-bandwidth in-package optical I\/O. IEEE Micro 40 2 (2020) 63\u201371.","DOI":"10.1109\/MM.2020.2976067"},{"key":"e_1_3_3_1_99_2","unstructured":"Zhiquan Wan Zhipeng Cao Shunbin Li Peijie Li Qingwen Deng Weihao Wang Kun Zhang Guandong Liu Ruyun Zhang and Qinrang Liu. 2024. Architectural Exploration for Waferscale Switching System. IEEE Transactions on Very Large Scale Integration (VLSI) Systems (2024)."},{"key":"e_1_3_3_1_100_2","doi-asserted-by":"crossref","unstructured":"Ronald Williams and Ogden Marsh. 1993. Future WSI technology: stacked monolithic WSI. IEEE transactions on components hybrids and manufacturing technology 16 7 (1993) 610\u2013614.","DOI":"10.1109\/33.257874"},{"key":"e_1_3_3_1_101_2","unstructured":"William Won Midhilesh Elavazhagan Sudarshan Srinivasan Ajaya Durg Samvit Kaul Swati Gupta and Tushar Krishna. 2023. TACOS: Topology-Aware Collective Algorithm Synthesizer for Distributed Machine Learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.05301 (2023)."},{"key":"e_1_3_3_1_102_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS57527.2023.00035"},{"key":"e_1_3_3_1_103_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS61541.2024.00028"},{"key":"e_1_3_3_1_104_2","doi-asserted-by":"crossref","unstructured":"Jiantao Wu Wan-Ping Lee Alistair Ward Jerilyn\u00a0A Walker Miriam\u00a0K Konkel Mark\u00a0A Batzer and Gabor\u00a0T Marth. 2014. Tangram: a comprehensive toolbox for mobile element insertion detection. BMC genomics 15 (2014) 1\u201315.","DOI":"10.1186\/1471-2164-15-795"},{"key":"e_1_3_3_1_105_2","doi-asserted-by":"crossref","first-page":"121","DOI":"10.1145\/3489517.3530428","volume-title":"The 59th ACM\/IEEE Design Automation Conference, San Francisco, USA","author":"Yinxiao FENG","year":"2022","unstructured":"FENG Yinxiao and MA Kaisheng. 2022. Chiplet actuary: A quantitative cost model and multi-chiplet architecture exploration [C]. In The 59th ACM\/IEEE Design Automation Conference, San Francisco, USA. 121\u2013126."},{"key":"e_1_3_3_1_106_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICPADS60453.2023.00126"},{"key":"e_1_3_3_1_107_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00082"},{"key":"e_1_3_3_1_108_2","unstructured":"Susan Zhang Stephen Roller Naman Goyal Mikel Artetxe Moya Chen Shuohui Chen Christopher Dewan Mona Diab Xian Li Xi\u00a0Victoria Lin et\u00a0al. 2023. Opt: Open pre-trained transformer language models 2022. URL https:\/\/arxiv. org\/abs\/2205.01068 3 (2023) 19\u20130."},{"key":"e_1_3_3_1_109_2","unstructured":"Jingchen Zhu Chenhao Xue Yiqi Chen Zhao Wang and Guangyu Sun. 2024. Theseus: Exploring Efficient Wafer-Scale Chip Design for Large Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.02079 (2024)."}],"event":{"name":"ISCA '25: Proceedings of the 52nd Annual International Symposium on Computer Architecture","location":"Tokyo Japan","acronym":"SIGARCH '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 52nd Annual International Symposium on Computer Architecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3695053.3731045","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,21]],"date-time":"2025-06-21T10:57:44Z","timestamp":1750503464000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3695053.3731045"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,20]]},"references-count":108,"alternative-id":["10.1145\/3695053.3731045","10.1145\/3695053"],"URL":"https:\/\/doi.org\/10.1145\/3695053.3731045","relation":{},"subject":[],"published":{"date-parts":[[2025,6,20]]},"assertion":[{"value":"2025-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}