{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T13:08:46Z","timestamp":1768223326093,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,17]]},"DOI":"10.1145\/3777867.3777878","type":"proceedings-article","created":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T09:57:39Z","timestamp":1768211859000},"page":"113-123","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["HALO-GPT:Hindi Active Learning with Oracle GPT-3.5"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-3972-2227","authenticated-orcid":false,"given":"Ajanta","family":"Maurya","sequence":"first","affiliation":[{"name":"Indian Institute of Technology Guwahati, Guwahati, Assam, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7856-5322","authenticated-orcid":false,"given":"V. Vijaya","family":"Saradhi","sequence":"additional","affiliation":[{"name":"Indian Institute of Technology Guwahati, Guwahati, Assam, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0024-3358","authenticated-orcid":false,"given":"Ashish","family":"Anand","sequence":"additional","affiliation":[{"name":"Indian Institute of Technology Guwahati, Guwahati, Assam, India"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,1,12]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"crossref","unstructured":"Ankit Agrawal Sarsij Tripathi and Manu Vardhan. 2021. Active learning approach using a modified least confidence sampling strategy for named entity recognition. Progress in Artificial Intelligence 10 2 (2021) 113\u2013128.","DOI":"10.1007\/s13748-021-00230-w"},{"key":"e_1_3_3_2_3_2","volume-title":"International Conference on Learning Representations","author":"Ash Jordan\u00a0T","unstructured":"Jordan\u00a0T Ash, Chicheng Zhang, Akshay Krishnamurthy, John Langford, and Alekh Agarwal. [n. d.]. Deep Batch Active Learning by Diverse, Uncertain Gradient Lower Bounds. In International Conference on Learning Representations."},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"crossref","first-page":"362","DOI":"10.1145\/3299869.3314036","volume-title":"Proceedings of the 2019 International Conference on Management of Data","author":"Bach Stephen\u00a0H","year":"2019","unstructured":"Stephen\u00a0H Bach, Daniel Rodriguez, Yintao Liu, Chong Luo, Haidong Shao, Cassandra Xia, Souvik Sen, Alex Ratner, Braden Hancock, Houman Alborzi, et\u00a0al. 2019. Snorkel drybell: A case study in deploying weak supervision at industrial scale. In Proceedings of the 2019 International Conference on Management of Data. 362\u2013375."},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.eval4nlp-1.8"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"crossref","unstructured":"Yukun Chen Thomas\u00a0A Lasko Qiaozhu Mei Joshua\u00a0C Denny and Hua Xu. 2015. A study of active learning methods for named entity recognition in clinical text. Journal of biomedical informatics 58 (2015) 11\u201318.","DOI":"10.1016\/j.jbi.2015.09.010"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.747"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.5555\/1619410.1619452"},{"key":"e_1_3_3_2_9_2","unstructured":"TensorFlow Datasets. 2024. WikiAnn: Multilingual Named Entity Recognition. https:\/\/www.tensorflow.org\/datasets\/catalog\/wikiann. Accessed: 2024-07-15."},{"key":"e_1_3_3_2_10_2","first-page":"1","volume-title":"Proceedings of the First International Conference on Human Language Technology Research","author":"David Yarowsky","year":"2001","unstructured":"Yarowsky David, Ngai Grace, Wicentowski Richard, et\u00a0al. 2001. Inducing multilingual text analysis tools via robust projection across aligned corpora. In Proceedings of the First International Conference on Human Language Technology Research. 1\u20138."},{"key":"e_1_3_3_2_11_2","unstructured":"Bosheng Ding Chengwei Qin Linlin Liu Yew\u00a0Ken Chia Shafiq Joty Boyang Li and Lidong Bing. 2022. Is gpt-3 a good data annotator? arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2212.10450 (2022)."},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"crossref","unstructured":"Bo Du Zengmao Wang Lefei Zhang Liangpei Zhang Wei Liu Jialie Shen and Dacheng Tao. 2017. Exploring Representativeness and Informativeness for Active Learning. IEEE Transactions on Cybernetics (2017).","DOI":"10.1109\/TCYB.2015.2496974"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"crossref","unstructured":"Asif Ekbal Sriparna Saha and Utpal\u00a0Kumar Sikdar. 2016. On active annotation for named entity recognition. International Journal of Machine Learning and Cybernetics 7 (2016) 623\u2013640.","DOI":"10.1007\/s13042-014-0275-8"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"crossref","unstructured":"Jason\u00a0A Fries Paroma Varma Vincent\u00a0S Chen Ke Xiao Heliodoro Tejeda Priyanka Saha Jared Dunnmon Henry Chubb Shiraz Maskatia Madalina Fiterau et\u00a0al. 2019. Weakly supervised classification of aortic valve malformations using unlabeled cardiac MRI sequences. Nature communications 10 1 (2019) 3111.","DOI":"10.1038\/s41467-019-11012-3"},{"key":"e_1_3_3_2_15_2","first-page":"1183","volume-title":"International conference on machine learning","author":"Gal Yarin","year":"2017","unstructured":"Yarin Gal, Riashat Islam, and Zoubin Ghahramani. 2017. Deep bayesian active learning with image data. In International conference on machine learning. PMLR, 1183\u20131192."},{"key":"e_1_3_3_2_16_2","first-page":"673","volume-title":"Machine Learning for Healthcare Conference","author":"Gao Chufan","year":"2022","unstructured":"Chufan Gao, Mononito Goswami, Jieshi Chen, and Artur Dubrawski. 2022. Classifying unstructured clinical notes via automatic weak supervision. In Machine Learning for Healthcare Conference. PMLR, 673\u2013690."},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"crossref","unstructured":"Fabrizio Gilardi Meysam Alizadeh and Ma\u00ebl Kubli. 2023. ChatGPT outperforms crowd workers for text-annotation tasks. Proceedings of the National Academy of Sciences 120 30 (2023) e2305016120.","DOI":"10.1073\/pnas.2305016120"},{"key":"e_1_3_3_2_18_2","first-page":"82","volume-title":"Machine Learning for Health (ML4H)","author":"Goel Akshay","year":"2023","unstructured":"Akshay Goel, Almog Gueta, Omry Gilon, Chang Liu, Sofia Erell, Lan\u00a0Huong Nguyen, Xiaohong Hao, Bolous Jaber, Shashir Reddy, Rupesh Kartha, et\u00a0al. 2023. Llms accelerate annotation for medical information extraction. In Machine Learning for Health (ML4H). PMLR, 82\u2013100."},{"key":"e_1_3_3_2_19_2","unstructured":"Neil Houlsby Ferenc Husz\u00e1r Zoubin Ghahramani and M\u00e1t\u00e9 Lengyel. 2011. Bayesian Active Learning for Classification and Preference Learning. stat 1050 (2011) 24."},{"key":"e_1_3_3_2_20_2","unstructured":"Sheng-Jun Huang Rong Jin and Zhi-Hua Zhou. 2010. Active learning by querying informative and representative examples. Advances in neural information processing systems 23 (2010)."},{"key":"e_1_3_3_2_21_2","unstructured":"Suramya Jadhav Abhay Shanbhag Amogh Thakurdesai Ridhima Sinare and Raviraj Joshi. 2024. On Limitations of LLM as Annotator for Low Resource Languages. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.17637 (2024)."},{"key":"e_1_3_3_2_22_2","unstructured":"Simran Khanuja Diksha Bansal Sarvesh Mehtani Savya Khosla Atreyee Dey Balaji Gopalan Dilip\u00a0Kumar Margam Pooja Aggarwal Rajiv\u00a0Teja Nagipogu Shachi Dave et\u00a0al. 2021. Muril: Multilingual representations for indian languages. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2103.10730 (2021)."},{"key":"e_1_3_3_2_23_2","first-page":"397","volume-title":"Joint European Conference on Machine Learning and Knowledge Discovery in Databases","author":"Kholodna Nataliia","year":"2024","unstructured":"Nataliia Kholodna, Sahib Julka, Mohammad Khodadadi, Muhammed\u00a0Nurullah Gumus, and Michael Granitzer. 2024. Llms in the loop: Leveraging large language model annotations for active learning in low-resource languages. In Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, 397\u2013412."},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"crossref","unstructured":"Sergei Koltcov Anton Surkov Olessia Koltsova and Vera Ignatenko. 2024. Using large language models for extracting and pre-annotating texts on mental health from noisy data in a low-resource language. PeerJ Computer Science 10 (2024) e2395.","DOI":"10.7717\/peerj-cs.2395"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"crossref","unstructured":"Panteleimon Krasadakis Evangelos Sakkopoulos and Vassilios\u00a0S Verykios. 2024. A survey on challenges and advances in natural language processing with a focus on legal informatics and low-resource languages. Electronics 13 3 (2024) 648.","DOI":"10.3390\/electronics13030648"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"crossref","unstructured":"J\u00a0Richard Landis and Gary\u00a0G Koch. 1977. The measurement of observer agreement for categorical data. biometrics (1977) 159\u2013174.","DOI":"10.2307\/2529310"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/219587.219592"},{"key":"e_1_3_3_2_28_2","unstructured":"Yinghao Li Pranav Shetty Lucas Liu Chao Zhang and Le Song. 2021. Bertifying the hidden markov model for multi-source weakly supervised named entity recognition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2105.12848 (2021)."},{"key":"e_1_3_3_2_29_2","first-page":"337","volume-title":"Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: System Demonstrations","author":"Lison Pierre","year":"2021","unstructured":"Pierre Lison, Jeremy Barnes, and Aliaksandr Hubin. 2021. skweak: Weak Supervision Made Easy for NLP. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: System Demonstrations. 337\u2013346."},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"crossref","first-page":"1518","DOI":"10.18653\/v1\/2020.acl-main.139","volume-title":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics","author":"Lison Pierre","year":"2020","unstructured":"Pierre Lison, Jeremy Barnes, Aliaksandr Hubin, and Samia Touileb. 2020. Named Entity Recognition without Labelled Data: A Weak Supervision Approach. In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics. 1518\u20131533."},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"crossref","unstructured":"Kun Liu Yao Fu Chuanqi Tan Mosha Chen Ningyu Zhang Songfang Huang and Sheng Gao. 2021. Noisy-labeled NER with confidence estimation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2104.04318 (2021).","DOI":"10.18653\/v1\/2021.naacl-main.269"},{"key":"e_1_3_3_2_32_2","first-page":"13332","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","volume":"34","author":"Mallinar Neil","year":"2020","unstructured":"Neil Mallinar, Abhishek Shah, Tin\u00a0Kam Ho, Rajendra Ugrani, and Ayush Gupta. 2020. Iterative data programming for expanding text classification corpora. In Proceedings of the AAAI Conference on Artificial Intelligence , Vol.\u00a034. 13332\u201313337."},{"key":"e_1_3_3_2_33_2","unstructured":"Shervin Malmasi Anjie Fang Besnik Fetahu Sudipta Kar and Oleg Rokhlenko. 2022. MultiCoNER: A large-scale multilingual dataset for complex named entity recognition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2208.14536 (2022)."},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.51"},{"key":"e_1_3_3_2_35_2","unstructured":"Arnav Mhaske Harshit Kedia Sumanth Doddapaneni Mitesh\u00a0M Khapra Pratyush Kumar Rudra Murthy\u00a0V and Anoop Kunchukuttan. 2022. Naamapadam: a large-scale named entity annotated data for Indic languages. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2212.10168 (2022)."},{"key":"e_1_3_3_2_36_2","first-page":"38","volume-title":"Proceedings on","author":"Mohta Jay","year":"2023","unstructured":"Jay Mohta, Kenan Ak, Yan Xu, and Mingwei Shen. 2023. Are large language models good annotators?. In Proceedings on. PMLR, 38\u201348."},{"key":"e_1_3_3_2_37_2","unstructured":"Rudra Murthy Pallab Bhattacharjee Rahul Sharnagat Jyotsana Khatri Diptesh Kanojia and Pushpak Bhattacharyya. 2022. Hiner: A large hindi named entity recognition dataset. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2204.13743 (2022)."},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"crossref","unstructured":"Arbi\u00a0Haza Nasution and Aytu\u011f Onan. 2024. Chatgpt label: Comparing the quality of human-generated and llm-generated annotations in low-resource language nlp tasks. IEEE Access 12 (2024) 71876\u201371900.","DOI":"10.1109\/ACCESS.2024.3402809"},{"key":"e_1_3_3_2_39_2","unstructured":"OpenAI. 2023. OpenAI Platform Models Documentation. https:\/\/platform.openai.com\/docs\/models Accessed: 2025-06-25."},{"key":"e_1_3_3_2_40_2","unstructured":"Maja Pavlovic and Massimo Poesio. 2024. The Effectiveness of LLMs as Annotators: A Comparative Overview and Empirical Analysis of Direct Representation. LREC-COLING 2024 (2024) 100."},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"crossref","unstructured":"Minlong Peng Xiaoyu Xing Qi Zhang Jinlan Fu and Xuanjing Huang. 2019. Distantly supervised named entity recognition using positive-unlabeled learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1906.01378 (2019).","DOI":"10.18653\/v1\/P19-1231"},{"key":"e_1_3_3_2_42_2","first-page":"575","volume-title":"2021 IEEE\/ACM 18th International Conference on Mining Software Repositories (MSR)","author":"Rao Nikitha","year":"2021","unstructured":"Nikitha Rao, Chetan Bansal, and Joe Guan. 2021. Search4Code: Code search intent classification using weak supervision. In 2021 IEEE\/ACM 18th International Conference on Mining Software Repositories (MSR). IEEE, 575\u2013579."},{"key":"e_1_3_3_2_43_2","unstructured":"Alexander\u00a0J Ratner Christopher\u00a0M De\u00a0Sa Sen Wu Daniel Selsam and Christopher R\u00e9. 2016. Data programming: Creating large training sets quickly. Advances in neural information processing systems 29 (2016)."},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"crossref","unstructured":"Pengzhen Ren Yun Xiao Xiaojun Chang Po-Yao Huang Zhihui Li Brij\u00a0B Gupta Xiaojiang Chen and Xin Wang. 2021. A survey of deep active learning. ACM computing surveys (CSUR) 54 9 (2021) 1\u201340.","DOI":"10.1145\/3472291"},{"key":"e_1_3_3_2_45_2","first-page":"133","volume-title":"Proceedings of the 18th Linguistic Annotation Workshop (LAW-XVIII)","author":"R\u00f8nningstad Egil","year":"2024","unstructured":"Egil R\u00f8nningstad, Erik Velldal, and Lilja \u00d8vrelid. 2024. A GPT among annotators: LLM-based entity-level sentiment annotation. In Proceedings of the 18th Linguistic Annotation Workshop (LAW-XVIII). 133\u2013139."},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.6009"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.5555\/647967.741626"},{"key":"e_1_3_3_2_48_2","volume-title":"International Conference on Learning Representations","author":"Sener Ozan","year":"2018","unstructured":"Ozan Sener and Silvio Savarese. 2018. Active Learning for Convolutional Neural Networks: A Core-Set Approach. In International Conference on Learning Representations."},{"key":"e_1_3_3_2_49_2","unstructured":"Burr Settles. 2009. Active learning literature survey. (2009)."},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W17-2630"},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"crossref","first-page":"27","DOI":"10.18653\/v1\/W18-2405","volume-title":"Proceedings of the seventh named entities workshop","author":"Singh Vinay","year":"2018","unstructured":"Vinay Singh, Deepanshu Vijay, Syed\u00a0Sarfaraz Akhtar, and Manish Shrivastava. 2018. Named entity recognition for Hindi-English code-mixed social media text. In Proceedings of the seventh named entities workshop. 27\u201335."},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"crossref","unstructured":"Ryan Smith Jason\u00a0A Fries Braden Hancock and Stephen\u00a0H Bach. 2024. Language models in the loop: Incorporating prompting into weak supervision. ACM\/JMS Journal of Data Science 1 2 (2024) 1\u201330.","DOI":"10.1145\/3617130"},{"key":"e_1_3_3_2_53_2","first-page":"154","volume-title":"Proceedings of the Eighth Workshop on Noisy User-generated Text (W-NUT 2022)","author":"Sumukh S","year":"2022","unstructured":"S Sumukh and Manish Shrivastava. 2022. \u201cKanglish alli names!\u201d Named Entity Recognition for Kannada-English Code-Mixed Social Media Data. In Proceedings of the Eighth Workshop on Noisy User-generated Text (W-NUT 2022). 154\u2013161."},{"key":"e_1_3_3_2_54_2","first-page":"930","volume-title":"Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing","author":"Tan Zhen","year":"2024","unstructured":"Zhen Tan, Dawei Li, Song Wang, Alimohammad Beigi, Bohan Jiang, Amrita Bhattacharjee, Mansooreh Karami, Jundong Li, Lu Cheng, and Huan Liu. 2024. Large language models for data annotation and synthesis: A survey. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing. 930\u2013957."},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"crossref","unstructured":"Yanshan Wang Sunghwan Sohn Sijia Liu Feichen Shen Liwei Wang Elizabeth\u00a0J Atkinson Shreyasee Amin and Hongfang Liu. 2019. A clinical text classification paradigm using weak supervision and deep representation. BMC medical informatics and decision making 19 1 (2019) 1.","DOI":"10.1186\/s12911-018-0723-6"},{"key":"e_1_3_3_2_56_2","first-page":"399","volume-title":"Medical Image Computing and Computer Assisted Intervention- MICCAI 2017: 20th International Conference, Quebec City, QC, Canada, September 11-13, 2017, Proceedings, Part III 20","author":"Yang Lin","year":"2017","unstructured":"Lin Yang, Yizhe Zhang, Jianxu Chen, Siyuan Zhang, and Danny\u00a0Z Chen. 2017. Suggestive annotation: A deep active learning framework for biomedical image segmentation. In Medical Image Computing and Computer Assisted Intervention- MICCAI 2017: 20th International Conference, Quebec City, QC, Canada, September 11-13, 2017, Proceedings, Part III 20. Springer, 399\u2013407."},{"key":"e_1_3_3_2_57_2","unstructured":"Jieyu Zhang Yue Yu Yinghao Li Yujing Wang Yaming Yang Mao Yang and Alexander Ratner. 2021. WRENCH: A comprehensive benchmark for weak supervision. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2109.11377 (2021)."},{"key":"e_1_3_3_2_58_2","doi-asserted-by":"crossref","first-page":"13088","DOI":"10.18653\/v1\/2023.findings-emnlp.872","volume-title":"Findings of the Association for Computational Linguistics: EMNLP 2023","author":"Zhang Ruoyu","year":"2023","unstructured":"Ruoyu Zhang, Yanzeng Li, Yongliang Ma, Ming Zhou, and Lei Zou. 2023. LLMaAA: Making Large Language Models as Active Annotators. In Findings of the Association for Computational Linguistics: EMNLP 2023. 13088\u201313103."}],"event":{"name":"FIRE 2025: Forum for Information Retrieval Evaluation","location":"Varanasi India","acronym":"FIRE 2025"},"container-title":["Proceedings of the 17th annual meeting of the Forum for Information Retrieval Evaluation"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3777867.3777878","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T09:58:36Z","timestamp":1768211916000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3777867.3777878"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,17]]},"references-count":57,"alternative-id":["10.1145\/3777867.3777878","10.1145\/3777867"],"URL":"https:\/\/doi.org\/10.1145\/3777867.3777878","relation":{},"subject":[],"published":{"date-parts":[[2025,12,17]]},"assertion":[{"value":"2026-01-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}