{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,25]],"date-time":"2025-09-25T18:08:28Z","timestamp":1758823708271,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":25,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,7]],"date-time":"2024-05-07T00:00:00Z","timestamp":1715040000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,7]]},"DOI":"10.1145\/3629526.3645045","type":"proceedings-article","created":{"date-parts":[[2024,5,6]],"date-time":"2024-05-06T20:44:32Z","timestamp":1715028272000},"page":"201-210","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Leftovers for LLaMA"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-9547-5968","authenticated-orcid":false,"given":"Ravi Kumar","family":"Singh","sequence":"first","affiliation":[{"name":"TCS Research, Thane, India"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9887-8075","authenticated-orcid":false,"given":"Likhith","family":"Bandamudi","sequence":"additional","affiliation":[{"name":"TCS Research, Thane, India"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4708-0496","authenticated-orcid":false,"given":"Shruti","family":"Kunde","sequence":"additional","affiliation":[{"name":"TCS Research, Mumbai, India"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4654-1246","authenticated-orcid":false,"given":"Mayank","family":"Mishra","sequence":"additional","affiliation":[{"name":"TCS Research, Thane, India"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3712-1784","authenticated-orcid":false,"given":"Rekha","family":"Singhal","sequence":"additional","affiliation":[{"name":"TCS Research, Paceport, NY, USA"}]}],"member":"320","published-online":{"date-parts":[[2024,5,7]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"[n. d.]. Amazon SageMaker. https:\/\/docs.aws.amazon.com\/sagemaker\/latest\/dg\/whatis.html."},{"key":"e_1_3_2_1_2_1","unstructured":"[n. d.]. AWS. https:\/\/aws.amazon.com\/."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_2_1_4_1","volume-title":"Benchmarking LLM powered Chatbots: Methods and Metrics. arXiv preprint arXiv:2308.04624","author":"Banerjee Debarag","year":"2023","unstructured":"Debarag Banerjee, Pooja Singh, Arjun Avadhanam, and Saksham Srivastava. 2023. Benchmarking LLM powered Chatbots: Methods and Metrics. arXiv preprint arXiv:2308.04624 (2023)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Alexander Borzunov Dmitry Baranchuk Tim Dettmers Max Ryabinin Younes Belkada Artem Chumachenko Pavel Samygin and Colin Raffel. 2022. Distributed Inference and Fine-tuning of Large Language Models Over The Internet. (2022).","DOI":"10.18653\/v1\/2023.acl-demo.54"},{"key":"e_1_3_2_1_6_1","volume-title":"Petals: Collaborative inference and fine-tuning of large models. arXiv preprint arXiv:2209.01188","author":"Borzunov Alexander","year":"2022","unstructured":"Alexander Borzunov, Dmitry Baranchuk, Tim Dettmers, Max Ryabinin, Younes Belkada, Artem Chumachenko, Pavel Samygin, and Colin Raffel. 2022. Petals: Collaborative inference and fine-tuning of large models. arXiv preprint arXiv:2209.01188 (2022)."},{"key":"e_1_3_2_1_7_1","volume-title":"Petals: Collaborative Inference and Fine-tuning of Large Models. arXiv:2209.01188 [cs.LG]","author":"Borzunov Alexander","year":"2023","unstructured":"Alexander Borzunov, Dmitry Baranchuk, Tim Dettmers, Max Ryabinin, Younes Belkada, Artem Chumachenko, Pavel Samygin, and Colin Raffel. 2023. Petals: Collaborative Inference and Fine-tuning of Large Models. arXiv:2209.01188 [cs.LG]"},{"key":"e_1_3_2_1_8_1","unstructured":"Jennifer D'Souza. [n. d.]. A Review of Transformer Models. ([n. d.])."},{"key":"e_1_3_2_1_9_1","volume-title":"Recommender systems in the era of large language models (llms). arXiv preprint arXiv:2307.02046","author":"Fan Wenqi","year":"2023","unstructured":"Wenqi Fan, Zihuai Zhao, Jiatong Li, Yunqing Liu, Xiaowei Mei, Yiqi Wang, Jiliang Tang, and Qing Li. 2023. Recommender systems in the era of large language models (llms). arXiv preprint arXiv:2307.02046 (2023)."},{"key":"e_1_3_2_1_10_1","first-page":"9","article-title":"A review of ChatGPT AI's impact on several business sectors","volume":"1","author":"Shaji George A","year":"2023","unstructured":"A Shaji George and AS Hovan George. 2023. A review of ChatGPT AI's impact on several business sectors. Partners Universal International Innovation Journal 1, 1 (2023), 9--23.","journal-title":"Partners Universal International Innovation Journal"},{"key":"e_1_3_2_1_11_1","volume-title":"Naveed Akhtar, Jia Wu, Seyedali Mirjalili, et al.","author":"Hadi Muhammad Usman","year":"2023","unstructured":"Muhammad Usman Hadi, Rizwan Qureshi, Abbas Shah, Muhammad Irfan, Anas Zafar, Muhammad Bilal Shaikh, Naveed Akhtar, Jia Wu, Seyedali Mirjalili, et al. 2023. Large language models: a comprehensive survey of its applications, challenges, limitations, and future prospects. (2023)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.13033\/ijahp.v6i1.226"},{"key":"e_1_3_2_1_13_1","volume-title":"Recommender AI Agent: Integrating Large Language Models for Interactive Recommendations. arXiv preprint arXiv:2308.16505","author":"Huang Xu","year":"2023","unstructured":"Xu Huang, Jianxun Lian, Yuxuan Lei, Jing Yao, Defu Lian, and Xing Xie. 2023. Recommender AI Agent: Integrating Large Language Models for Interactive Recommendations. arXiv preprint arXiv:2308.16505 (2023)."},{"key":"e_1_3_2_1_14_1","volume-title":"Large language model empowered agents for simulating macroeconomic activities. arXiv preprint arXiv:2310.10436","author":"Li Nian","year":"2023","unstructured":"Nian Li, Chen Gao, Yong Li, and Qingmin Liao. 2023. Large language model empowered agents for simulating macroeconomic activities. arXiv preprint arXiv:2310.10436 (2023)."},{"key":"e_1_3_2_1_15_1","unstructured":"Jianghao Lin Xinyi Dai Yunjia Xi Weiwen Liu Bo Chen Xiangyang Li Chenxu Zhu Huifeng Guo Yong Yu Ruiming Tang et al. 2023. How Can Recommender Systems Benefit from Large Language Models: A Survey. arXiv preprint arXiv:2306.05817 (2023)."},{"key":"e_1_3_2_1_16_1","volume-title":"Automating Customer Service using LangChain: Building custom open-source GPT Chatbot for organizations. arXiv preprint arXiv:2310.05421","author":"Pandya Keivalya","year":"2023","unstructured":"Keivalya Pandya and Mehfuza Holia. 2023. Automating Customer Service using LangChain: Building custom open-source GPT Chatbot for organizations. arXiv preprint arXiv:2310.05421 (2023)."},{"key":"e_1_3_2_1_17_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Rajbhandari Samyam","year":"2022","unstructured":"Samyam Rajbhandari, Conglong Li, Zhewei Yao, and Minjia Zhang. 2022. Deepspeed-moe: Advancing mixture-of-experts inference and training to power next-generation ai scale. In International Conference on Machine Learning. PMLR, 18332--18346."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_19_1","unstructured":"Ying Sheng Lianmin Zheng Binhang Yuan Zhuohan Li Max Ryabinin and Chen. 2023. FlexGen: High-Throughput Generative Inference of Large Language Models with a Single GPU. (2023)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","DOI":"10.1109\/IJCNN.2004.1381208","volume-title":"2004 IEEE International Joint Conference on Neural Networks (IEEE Cat. No. 04CH37541)","volume":"4","author":"Tan Ah-Hwee","year":"2004","unstructured":"Ah-Hwee Tan. 2004. FALCON: A fusion architecture for learning, cognition, and navigation. In 2004 IEEE International Joint Conference on Neural Networks (IEEE Cat. No. 04CH37541), Vol. 4. IEEE, 3297--3302."},{"key":"e_1_3_2_1_21_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, and Gautier Izacard. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_22_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra et al. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. arXiv:2307.09288 [cs.CL]"},{"volume-title":"Workshop, Teven Le Scao, Angela Fan, Christopher Akiki, Ellie Pavlick, Suzana Ili?, Daniel Hesslow, Castagn\u00e9, Fran\u00e7ois Yvon, et al. 2022","year":"2022","key":"e_1_3_2_1_23_1","unstructured":"BigScience Workshop, Teven Le Scao, Angela Fan, Christopher Akiki, Ellie Pavlick, Suzana Ili?, Daniel Hesslow, Castagn\u00e9, Fran\u00e7ois Yvon, et al. 2022. Bloom: A 176b-parameter open-access multilingual language model. arXiv preprint arXiv:2211.05100 (2022)."},{"key":"e_1_3_2_1_24_1","volume-title":"Fast Distributed Inference Serving for Large Language Models. arXiv preprint arXiv:2305.05920","author":"Wu Bingyang","year":"2023","unstructured":"Bingyang Wu, Yinmin Zhong, Zili Zhang, Gang Huang, Xuanzhe Liu, and Xin Jin. 2023. Fast Distributed Inference Serving for Large Language Models. arXiv preprint arXiv:2305.05920 (2023)."},{"key":"e_1_3_2_1_25_1","volume-title":"Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer.","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer. 2022. OPT: Open Pre-trained Transformer Language Models. arXiv:2205.01068 [cs.CL]"}],"event":{"name":"ICPE '24: 15th ACM\/SPEC International Conference on Performance Engineering","sponsor":["SIGMETRICS ACM Special Interest Group on Measurement and Evaluation","SIGSOFT ACM Special Interest Group on Software Engineering"],"location":"London United Kingdom","acronym":"ICPE '24"},"container-title":["Proceedings of the 15th ACM\/SPEC International Conference on Performance Engineering"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3629526.3645045","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3629526.3645045","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T23:48:04Z","timestamp":1755906484000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3629526.3645045"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,7]]},"references-count":25,"alternative-id":["10.1145\/3629526.3645045","10.1145\/3629526"],"URL":"https:\/\/doi.org\/10.1145\/3629526.3645045","relation":{},"subject":[],"published":{"date-parts":[[2024,5,7]]},"assertion":[{"value":"2024-05-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}