{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,15]],"date-time":"2025-10-15T00:40:46Z","timestamp":1760488846166,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,27]]},"DOI":"10.1145\/3704268.3742691","type":"proceedings-article","created":{"date-parts":[[2025,8,27]],"date-time":"2025-08-27T16:44:47Z","timestamp":1756313087000},"page":"1-10","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["SoAC and SoACer: A Sector-Based Corpus and LLM-Based Framework for Sectoral Website Classification"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-5803-7316","authenticated-orcid":false,"given":"Shahriar","family":"Shayesteh","sequence":"first","affiliation":[{"name":"Pennsylvania State University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6474-8199","authenticated-orcid":false,"given":"Mukund","family":"Srinath","sequence":"additional","affiliation":[{"name":"Pennsylvania State University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5639-8379","authenticated-orcid":false,"given":"Lee","family":"Matheson","sequence":"additional","affiliation":[{"name":"Future of Privacy Forum"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8120-1012","authenticated-orcid":false,"given":"Lu","family":"Xian","sequence":"additional","affiliation":[{"name":"University of Michigan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5165-1602","authenticated-orcid":false,"given":"Sinjoy","family":"Saha","sequence":"additional","affiliation":[{"name":"Pennsylvania State University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1931-585X","authenticated-orcid":false,"given":"C. Lee","family":"Giles","sequence":"additional","affiliation":[{"name":"Pennsylvania State University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1235-3754","authenticated-orcid":false,"given":"Shomir","family":"Wilson","sequence":"additional","affiliation":[{"name":"Pennsylvania State University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,8,27]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","unstructured":"Siti Hawa Apandi Jamaludin Sallim Rozlina Mohamed and Araby Madbouly. 2021. Web Page Classification Using Convolutional Neural Network (CNN) Towards Eliminating Internet Addiction. In 2021 International Conference on Software Engineering Computer Systems and 4th International Conference on Computational Science and Information Management (ICSECS-ICOCSIM). 149--154. doi:10.1109\/ICSECS52883.2021.00034","DOI":"10.1109\/ICSECS52883.2021.00034"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/2677855.2677943"},{"key":"e_1_3_2_2_3_1","unstructured":"Maciej Besta Julia Barth Eric Schreiber Ales Kubicek Afonso Catarino Robert Gerstenberger Piotr Nyczyk Patrick Iff Yueling Li Sam Houliston et al. 2025. Reasoning Language Models: A Blueprint. arXiv preprint arXiv:2501.11223 (2025)."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-industry.10"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/956863.956938"},{"key":"e_1_3_2_2_6_1","volume-title":"Faster and Lighter LLMs: A Survey on Current Challenges and Way Forward. arXiv preprint arXiv:2402.01799","author":"Chavan Arnav","year":"2024","unstructured":"Arnav Chavan, Raghav Magazine, Shubham Kushwaha, M\u00e9rouane Debbah, and Deepak Gupta. 2024. Faster and Lighter LLMs: A Survey on Current Challenges and Way Forward. arXiv preprint arXiv:2402.01799 (2024)."},{"key":"e_1_3_2_2_7_1","volume-title":"Domain-specific long text classification from sparse relevant information. arXiv preprint arXiv:2408.13253","author":"D'Cruz C\u00e9lia","year":"2024","unstructured":"C\u00e9lia D'Cruz, Jean-Marc Bereder, Fr\u00e9d\u00e9ric Precioso, and Michel Riveill. 2024. Domain-specific long text classification from sparse relevant information. arXiv preprint arXiv:2408.13253 (2024)."},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1423"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1613\/jair.1523"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/775047.775084"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.5555\/647966.741464"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CONIT59222.2023.10205766"},{"key":"e_1_3_2_2_13_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_2_14_1","unstructured":"Daya Guo Dejian Yang Haowei Zhang Junxiao Song Ruoyu Zhang Runxin Xu Qihao Zhu Shirong Ma Peiyi Wang Xiao Bi et al. 2025. Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv preprint arXiv:2501.12948 (2025)."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/1364782.1364798"},{"key":"e_1_3_2_2_16_1","volume-title":"Overview of web content mining tools. arXiv preprint arXiv:1307.1024","author":"Herrouz Abdelhakim","year":"2013","unstructured":"Abdelhakim Herrouz, Chabane Khentout, and Mahieddine Djoudi. 2013. Overview of web content mining tools. arXiv preprint arXiv:1307.1024 (2013)."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_2_18_1","volume-title":"Krisztian Flautner, Lingjia Tang, Yiping Kang, and Jason Mars.","author":"Irugalbandara Chandra","year":"2023","unstructured":"Chandra Irugalbandara, Ashish Mahendra, Roland Daynauth, Tharuka Kasthuri Arachchige, Krisztian Flautner, Lingjia Tang, Yiping Kang, and Jason Mars. 2023. Scaling Down to Scale Up: A Cost-Benefit Analysis of Replacing OpenAI's GPT-4 with Self-Hosted Open Source SLMs in Production. arXiv preprint arXiv:2312.14972 (2023)."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.330"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/355214.355216"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.3233\/SJI-200675"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICAIBD.2019.8837027"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.433"},{"key":"e_1_3_2_2_24_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_2_25_1","volume-title":"International Joint Conference on the Analysis of Images, Social Networks and Texts. https:\/\/api.semanticscholar.org\/CorpusID:17659714","author":"Moiseev George","year":"2016","unstructured":"George Moiseev. 2016. Classification of E-commerce Websites by Product Categories. In International Joint Conference on the Analysis of Images, Social Networks and Texts. https:\/\/api.semanticscholar.org\/CorpusID:17659714"},{"key":"e_1_3_2_2_26_1","volume-title":"Zero-shot prompt-based classification: topic labeling in times of foundation models in German Tweets. arXiv preprint arXiv:2406.18239","author":"M\u00fcnker Simon","year":"2024","unstructured":"Simon M\u00fcnker, Kai Kugler, and Achim Rettinger. 2024. Zero-shot prompt-based classification: topic labeling in times of foundation models in German Tweets. arXiv preprint arXiv:2406.18239 (2024)."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1080\/08963568.2015.1110229"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/1183614.1183650"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/1459352.1459357"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3326467.3326486"},{"key":"e_1_3_2_2_31_1","volume-title":"C. Lee Giles, and Shomir Wilson.","author":"Srinath Mukund","year":"2021","unstructured":"Mukund Srinath, Soundarya Nurani Sundareswara, C. Lee Giles, and Shomir Wilson. 2021. PrivaSeer: A Privacy Policy Search Engine. In Web Engineering, Marco Brambilla, Richard Chbeir, Flavius Frasincar, and Ioana Manolescu (Eds.). Springer International Publishing, Cham, 286--301."},{"key":"e_1_3_2_2_32_1","volume-title":"LLMs are Also Effective Embedding Models: An In-depth Overview. arXiv preprint arXiv:2412.12591","author":"Tao Chongyang","year":"2024","unstructured":"Chongyang Tao, Tao Shen, Shen Gao, Junshuo Zhang, Zhen Li, Zhengwei Tao, and Shuai Ma. 2024. LLMs are Also Effective Embedding Models: An In-depth Overview. arXiv preprint arXiv:2412.12591 (2024)."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-32584-7_30"},{"key":"e_1_3_2_2_34_1","volume-title":"Attention is all you need. Advances in Neural Information Processing Systems","author":"Vaswani A","year":"2017","unstructured":"A Vaswani. 2017. Attention is all you need. Advances in Neural Information Processing Systems (2017)."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"crossref","unstructured":"Benjamin Warner Antoine Chaffin Benjamin Clavi\u00e9 Orion Weller Oskar Hall-str\u00f6m Said Taghadouini Alexis Gallagher Raja Biswas Faisal Ladhak Tom Aarsen et al. 2024. Smarter better faster longer: A modern bidirectional encoder for fast memory efficient and long context finetuning and inference. arXiv preprint arXiv:2412.13663 (2024).","DOI":"10.18653\/v1\/2025.acl-long.127"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/956750.956785"}],"event":{"name":"DocEng '25: ACM Symposium on Document Engineering 2025","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Nottingham United Kingdom","acronym":"DocEng '25"},"container-title":["Proceedings of the 2025 ACM Symposium on Document Engineering"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3704268.3742691","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,14]],"date-time":"2025-10-14T18:26:49Z","timestamp":1760466409000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3704268.3742691"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,27]]},"references-count":36,"alternative-id":["10.1145\/3704268.3742691","10.1145\/3704268"],"URL":"https:\/\/doi.org\/10.1145\/3704268.3742691","relation":{},"subject":[],"published":{"date-parts":[[2025,8,27]]},"assertion":[{"value":"2025-08-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}