{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T17:57:35Z","timestamp":1772906255933,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,18]],"date-time":"2024-06-18T00:00:00Z","timestamp":1718668800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"HUST","award":["T2023-PC-002"],"award-info":[{"award-number":["T2023-PC-002"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,18]]},"DOI":"10.1145\/3661167.3661168","type":"proceedings-article","created":{"date-parts":[[2024,6,14]],"date-time":"2024-06-14T12:24:25Z","timestamp":1718367865000},"page":"181-190","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["LEGION: Harnessing Pre-trained Language Models for GitHub Topic Recommendations with Distribution-Balance Loss"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-2781-9652","authenticated-orcid":false,"given":"Yen-Trang","family":"Dang","sequence":"first","affiliation":[{"name":"Hanoi University of Science and Technology, Viet Nam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9566-324X","authenticated-orcid":false,"given":"Thanh","family":"Le-Cong","sequence":"additional","affiliation":[{"name":"The University of Melbourne, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5440-6757","authenticated-orcid":false,"given":"Phuc-Thanh","family":"Nguyen","sequence":"additional","affiliation":[{"name":"Hanoi University of Science and Technology, Viet Nam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7877-9438","authenticated-orcid":false,"given":"Anh M. T.","family":"Bui","sequence":"additional","affiliation":[{"name":"Hanoi University of Science and Technology, Viet Nam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3666-4162","authenticated-orcid":false,"given":"Phuong T.","family":"Nguyen","sequence":"additional","affiliation":[{"name":"University of L'Aquila, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5044-1582","authenticated-orcid":false,"given":"Bach","family":"Le","sequence":"additional","affiliation":[{"name":"The University of Melbourne, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0788-6380","authenticated-orcid":false,"given":"Quyet-Thang","family":"Huynh","sequence":"additional","affiliation":[{"name":"Hanoi University of Science and Technology, Viet Nam"}]}],"member":"320","published-online":{"date-parts":[[2024,6,18]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3551349.3559555"},{"key":"e_1_3_2_1_2_1","volume-title":"Jared Kaplan, Harri Edwards, Yuri Burda","author":"Chen Mark","year":"2021","unstructured":"Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de\u00a0Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, 2021. Evaluating large language models trained on code. arXiv preprint arXiv:2107.03374 (2021)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3084050"},{"key":"e_1_3_2_1_4_1","volume-title":"Beware of the unexpected: Bimodal taint analysis. arXiv preprint arXiv:2301.10545","author":"Chow Yiu\u00a0Wai","year":"2023","unstructured":"Yiu\u00a0Wai Chow, Max Sch\u00e4fer, and Michael Pradel. 2023. 
Beware of the unexpected: Bimodal taint analysis. arXiv preprint arXiv:2301.10545 (2023)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/2970276.2970347"},{"key":"e_1_3_2_1_6_1","volume-title":"ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators. In ICLR. https:\/\/openreview.net\/pdf?id=r1xMH1BtvB","author":"Clark Kevin","year":"2020","unstructured":"Kevin Clark, Thang Luong, Quoc\u00a0V. Le, and Christopher Manning. 2020. ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators. In ICLR. https:\/\/openreview.net\/pdf?id=r1xMH1BtvB"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2145204.2145396"},{"key":"e_1_3_2_1_8_1","volume-title":"Artifacts: LEGION: Harnessing Pre-trained Language Models for GitHub Topic Recommendations with Distribution-Balance Loss. Figshare. https:\/\/figshare.com\/s\/6e01956fbfcd9b7ca6de","author":"Dang Yen-Trang","year":"2023","unstructured":"Yen-Trang Dang, Thanh Le-Cong, Phuc-Thanh Nguyen, Anh M.\u00a0T. Bui, Phuong\u00a0T. Nguyen, Bach Le, and Quyet-Thang Huynh. 2023. Artifacts: LEGION: Harnessing Pre-trained Language Models for GitHub Topic Recommendations with Distribution-Balance Loss. Figshare. https:\/\/figshare.com\/s\/6e01956fbfcd9b7ca6de"},{"key":"e_1_3_2_1_9_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL.","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3382494.3410690"},{"key":"e_1_3_2_1_11_1","volume-title":"HybridRec: A recommender system for tagging GitHub repositories. Applied Intelligence","author":"Di\u00a0Rocco Juri","year":"2022","unstructured":"Juri Di\u00a0Rocco, Davide Di\u00a0Ruscio, Claudio Di\u00a0Sipio, Phuong\u00a0T Nguyen, and Riccardo Rubei. 2022. HybridRec: A recommender system for tagging GitHub repositories. Applied Intelligence (2022), 1\u201323."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3383219.3383227"},{"key":"e_1_3_2_1_13_1","unstructured":"Shay Frendt. 2019. Introducing topics. https:\/\/github.blog\/2017-01-31-introducing-topics\/"},{"key":"e_1_3_2_1_14_1","unstructured":"Kavita Ganesan. 2017. Topic suggestions for millions of repositories. https:\/\/github.blog\/2017-07-31-topics\/"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3611643.3616291"},{"key":"e_1_3_2_1_16_1","volume-title":"Generative adversarial nets. Advances in neural information processing systems 27","author":"Goodfellow Ian","year":"2014","unstructured":"Ian Goodfellow, Jean Pouget-Abadie, Mehdi Mirza, Bing Xu, David Warde-Farley, Sherjil Ozair, Aaron Courville, and Yoshua Bengio. 2014. Generative adversarial nets. Advances in neural information processing systems 27 (2014)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447548.3467426"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3524610.3527897"},{"key":"e_1_3_2_1_19_1","volume-title":"Multilabel Classification","author":"Herrera Francisco","unstructured":"Francisco Herrera, Francisco Charte, Antonio\u00a0J Rivera, and Mar\u00eda J\u00a0del Jesus. 2016. Multilabel classification. In Multilabel Classification. 
Springer, 17\u201331."},{"key":"e_1_3_2_1_20_1","volume-title":"Large language models for software engineering: A systematic literature review. arXiv preprint arXiv:2308.10620","author":"Hou Xinyi","year":"2023","unstructured":"Xinyi Hou, Yanjie Zhao, Yue Liu, Zhou Yang, Kailong Wang, Li Li, Xiapu Luo, David Lo, John Grundy, and Haoyu Wang. 2023. Large language models for software engineering: A systematic literature review. arXiv preprint arXiv:2308.10620 (2023)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3551349.3556912"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","unstructured":"Yi Huang, Buse Giledereli, Abdullatif K\u00f6ksal, Arzucan Ozgur, and Elif Ozkirimli. 2021. Balancing Methods for Multi-label Text Classification with Long-Tailed Class Distribution. 8153\u20138161. https:\/\/doi.org\/10.18653\/v1\/2021.emnlp-main.643","DOI":"10.18653\/v1\/2021.emnlp-main.643"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10664-021-09976-2"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3540250.3549175"},{"key":"e_1_3_2_1_25_1","volume-title":"David Lo, Nhat-Hoa Tran, Bui Quang-Huy, and Quyet-Thang Huynh.","author":"Le-Cong Thanh","year":"2023","unstructured":"Thanh Le-Cong, Duc-Minh Luong, Xuan Bach\u00a0D Le, David Lo, Nhat-Hoa Tran, Bui Quang-Huy, and Quyet-Thang Huynh. 2023. Invalidator: Automated patch correctness assessment via semantic and syntactic reasoning. IEEE Transactions on Software Engineering (2023)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE.2013.6606583"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Zachary\u00a0Chase Lipton, Charles Elkan, and Balakrishnan Narayanaswamy. 2014. Thresholding Classifiers to Maximize F1 Score. arxiv:1402.1892\u00a0[stat.ML]","DOI":"10.1007\/978-3-662-44851-9_15"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3324884.3416591"},{"key":"e_1_3_2_1_31_1","volume-title":"Refining ChatGPT-generated code: Characterizing and mitigating code quality issues. arXiv preprint arXiv:2307.12596","author":"Liu Yue","year":"2023","unstructured":"Yue Liu, Thanh Le-Cong, Ratnadira Widyasari, Chakkrit Tantithamthavorn, Li Li, Xuan-Bach\u00a0D Le, and David Lo. 2023. Refining ChatGPT-generated code: Characterizing and mitigating code quality issues. arXiv preprint arXiv:2307.12596 (2023)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3551349.3560417"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE48619.2023.00094"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3340531.3412762"},{"key":"e_1_3_2_1_35_1","volume-title":"Duplicate bug report detection using an attention-based neural language model","author":"Messaoud Montassar\u00a0Ben","year":"2022","unstructured":"Montassar\u00a0Ben Messaoud, Asma Miladi, Ilyes Jenhani, Mohamed\u00a0Wiem Mkaouer, and Lobna Ghadhab. 2022. Duplicate bug report detection using an attention-based neural language model. 
IEEE Transactions on Reliability (2022)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3605943"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.5555\/1953048.2078195"},{"key":"e_1_3_2_1_38_1","volume-title":"Going farther together: The impact of social capital on sustained participation in open source. In 2019 IEEE\/ACM 41st International Conference on Software Engineering (ICSE)","author":"Qiu Huilian\u00a0Sophie","unstructured":"Huilian\u00a0Sophie Qiu, Alexander Nolte, Anita Brown, Alexander Serebrenik, and Bogdan Vasilescu. 2019. Going farther together: The impact of social capital on sustained participation in open source. In 2019 IEEE\/ACM 41st International Conference on Software Engineering (ICSE). IEEE, 688\u2013699."},{"key":"e_1_3_2_1_39_1","unstructured":"Alec Radford, Karthik Narasimhan, Tim Salimans, and Ilya Sutskever. 2018. Improving language understanding by generative pre-training. (2018)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3188720"},{"key":"e_1_3_2_1_41_1","volume-title":"a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108","author":"Sanh Victor","year":"2019","unstructured":"Victor Sanh, Lysandre Debut, Julien Chaumond, and Thomas Wolf. 2019. DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108 (2019)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3084226.3084287"},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings, Part VII 14","author":"Shen Li","year":"2016","unstructured":"Li Shen, Zhouchen Lin, and Qingming Huang. 2016. Relay backpropagation for effective learning of deep convolutional neural networks. In Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part VII 14. Springer, 467\u2013482."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE.2009.5070504"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSE.2010.91"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/2568225.2568315"},{"key":"e_1_3_2_1_47_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3510003.3510062"},{"key":"e_1_3_2_1_49_1","volume-title":"How well do pre-trained contextual language representations recommend labels for GitHub issues? Knowledge-Based Systems 232","author":"Wang Jun","year":"2021","unstructured":"Jun Wang, Xiaofang Zhang, and Lin Chen. 2021. How well do pre-trained contextual language representations recommend labels for GitHub issues? Knowledge-Based Systems 232 (2021), 107476."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/s40745-020-00253-"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10664-017-9533-1"},{"key":"e_1_3_2_1_52_1","volume-title":"Codet5: Identifier-aware unified pre-trained encoder-decoder models for code understanding and generation. arXiv preprint arXiv:2109.00859","author":"Wang Yue","year":"2021","unstructured":"Yue Wang, Weishi Wang, Shafiq Joty, and Steven\u00a0CH Hoi. 
2021. Codet5: Identifier-aware unified pre-trained encoder-decoder models for code understanding and generation. arXiv preprint arXiv:2109.00859 (2021)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/SANER56733.2023.00025"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1444"},{"key":"e_1_3_2_1_55_1","volume-title":"Proceedings, Part IV 16","author":"Wu Tong","year":"2020","unstructured":"Tong Wu, Qingqiu Huang, Ziwei Liu, Yu Wang, and Dahua Lin. 2020. Distribution-balanced loss for multi-label classification in long-tailed datasets. In Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part IV 16. Springer, 162\u2013178."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/MSR.2013.6624040"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASE56229.2023.00023"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSME46990.2020.00017"},{"key":"e_1_3_2_1_59_1","volume-title":"The Devil is in the Tails: How Long-Tailed Code Distributions Impact Large Language Models. arXiv preprint arXiv:2309.03567","author":"Zhou Xin","year":"2023","unstructured":"Xin Zhou, Kisub Kim, Bowen Xu, Jiakun Liu, DongGyun Han, and David Lo. 2023. The Devil is in the Tails: How Long-Tailed Code Distributions Impact Large Language Models. arXiv preprint arXiv:2309.03567 (2023)."},{"key":"e_1_3_2_1_60_1","volume-title":"Patchzero: Zero-shot automatic patch correctness assessment. arXiv preprint arXiv:2303.00202","author":"Zhou Xin","year":"2023","unstructured":"Xin Zhou, Bowen Xu, Kisub Kim, DongGyun Han, Thanh Le-Cong, Junda He, Bach Le, and David Lo. 2023. Patchzero: Zero-shot automatic patch correctness assessment. arXiv preprint arXiv:2303.00202 (2023)."},{"key":"e_1_3_2_1_61_1","volume-title":"2023 38th IEEE\/ACM International Conference on Automated Software Engineering (ASE). IEEE Computer Society, 40\u201352","author":"Zhou Xin","year":"2023","unstructured":"Xin Zhou, Kisub Kim, Bowen Xu, Jiakun Liu, DongGyun Han, and David Lo. 2023. The Devil is in the Tails: How Long-Tailed Code Distributions Impact Large Language Models. In 2023 38th IEEE\/ACM International Conference on Automated Software Engineering (ASE). IEEE Computer Society, 40\u201352."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.11"},{"key":"e_1_3_2_1_63_1","volume-title":"Proceedings of the 20th Chinese National Conference on Computational Linguistics. 1218\u20131227","author":"Zhuang Liu","year":"2021","unstructured":"Liu Zhuang, Lin Wayne, Shi Ya, and Zhao Jun. 2021. A Robustly Optimized BERT Pre-training Approach with Post-training. In Proceedings of the 20th Chinese National Conference on Computational Linguistics. 
1218\u20131227."}],"event":{"name":"EASE 2024: 28th International Conference on Evaluation and Assessment in Software Engineering","location":"Salerno, Italy","acronym":"EASE 2024"},"container-title":["Proceedings of the 28th International Conference on Evaluation and Assessment in Software Engineering"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3661167.3661168","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3661167.3661168","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T11:16:44Z","timestamp":1755861404000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3661167.3661168"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,18]]},"references-count":63,"alternative-id":["10.1145\/3661167.3661168","10.1145\/3661167"],"URL":"https:\/\/doi.org\/10.1145\/3661167.3661168","relation":{},"subject":[],"published":{"date-parts":[[2024,6,18]]},"assertion":[{"value":"2024-06-18","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}