{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,23]],"date-time":"2025-10-23T17:07:26Z","timestamp":1761239246418,"version":"3.44.0"},"reference-count":35,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T00:00:00Z","timestamp":1750204800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T00:00:00Z","timestamp":1750204800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J. King Saud Univ. Comput. Inf. Sci."],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1007\/s44443-025-00071-w","type":"journal-article","created":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T08:51:34Z","timestamp":1750236694000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Automatic XPath generation agents for vertical websites by LLMs"],"prefix":"10.1007","volume":"37","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-5936-367X","authenticated-orcid":false,"given":"Jing","family":"Huang","sequence":"first","affiliation":[]},{"given":"Jie","family":"Song","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,6,18]]},"reference":[{"key":"71_CR1","doi-asserted-by":"publisher","unstructured":"Aggarwal M, Gupta H, Sarkar M, Krishnamurthy B (2020) Form2Seq : a framework for higher-order form structure extraction. In: Proceedings of the 2020 conference on empirical methods in natural language processing (EMNLP). Association for Computational Linguistics, pp 3830\u20133840. https:\/\/aclanthology.org\/2020.emnlp-main.314. https:\/\/doi.org\/10.18653\/v1\/2020.emnlp-main.314","DOI":"10.18653\/v1\/2020.emnlp-main.314"},{"key":"71_CR2","doi-asserted-by":"crossref","unstructured":"Appalaraju S, Jasani B, Kota BU, Xie Y, Manmatha R (2021) DocFormer: end-to-end transformer for document understanding. In: 2021 IEEE\/CVF international conference on computer vision (ICCV), pp 973\u2013983. https:\/\/api.semanticscholar.org\/CorpusID:235592814","DOI":"10.1109\/ICCV48922.2021.00103"},{"key":"71_CR3","unstructured":"Dhananjay A, Lipton ZC (2023) PromptNER: prompting for named entity recognition. arXiv. https:\/\/arxiv.org\/abs\/2305.15444"},{"key":"71_CR4","doi-asserted-by":"crossref","unstructured":"\u00a0Bai F, Kang J, Stanovsky G, Freitag D, Dredze M, Ritter A (2023) Schema-driven information extraction from heterogeneous tables. arXiv. https:\/\/arxiv.org\/abs\/2305.14336","DOI":"10.18653\/v1\/2024.findings-emnlp.600"},{"key":"71_CR5","doi-asserted-by":"publisher","unstructured":"B\u00f6l\u00fcc\u00fc N, Maciej R, Wan S (2023) Impact of sample selection on in-context learning for entity extraction from scientific writing. In: Findings of the association for computational linguistics: EMNLP 2023. Association for Computational Linguistics, Singapore, pp 5090\u20135107. https:\/\/aclanthology.org\/2023.findings-emnlp.338. https:\/\/doi.org\/10.18653\/v1\/2023.findings-emnlp.338","DOI":"10.18653\/v1\/2023.findings-emnlp.338"},{"key":"71_CR6","doi-asserted-by":"publisher","unstructured":"Cheema SM, Tariq S, Pires IM (2023) A natural language interface for automatic generation of data flow diagram using web extraction techniques. J King Saud Univ \u2013 Comput Inf Sci 35(2):626\u2013640. https:\/\/doi.org\/10.1016\/j.jksuci.2023.01.006","DOI":"10.1016\/j.jksuci.2023.01.006"},{"key":"71_CR7","unstructured":"DeepSeek-AI (2024) DeepSeek-V2: a strong, economical, and efficient mixture-of-experts language model. arXiv. https:\/\/arxiv.org\/abs\/2405.04434"},{"key":"71_CR8","unstructured":"Deng X, Gu Y, Zheng B, Chen S, Stevens S, Wang B, Sun H, Su Y (2023) Mind2Web: towards a generalist agent for the web. arXiv. https:\/\/arxiv.org\/abs\/2306.06070"},{"key":"71_CR9","doi-asserted-by":"publisher","first-page":"301","DOI":"10.1016\/j.knosys.2014.07.007","volume":"70","author":"E Ferrara","year":"2014","unstructured":"Ferrara E, De Meo P, Fiumara G, Baumgartner R (2014) Web data extraction, applications and techniques: a survey. Knowl-Based Syst 70:301\u2013323","journal-title":"Knowl-Based Syst"},{"key":"71_CR10","unstructured":"Gan C, Zhang Q, Mori T (2023) GIELLM: Japanese general information extraction large language model utilizing mutual reinforcement effect. arXiv https:\/\/arxiv.org\/abs\/2311.06838"},{"key":"71_CR11","doi-asserted-by":"crossref","unstructured":"Gonz\u00e1lez-Gallardo C-E, Boros E, Girdhar N, Hamdi A, Moreno JG, Doucet A (2023) Yes but.. can ChatGPT identify entities in historical documents? In: 2023 ACM\/IEEE joint conference on digital libraries (JCDL), pp 184\u2013189 https:\/\/api.semanticscholar.org\/CorpusID:257833707","DOI":"10.1109\/JCDL57899.2023.00034"},{"key":"71_CR12","doi-asserted-by":"crossref","unstructured":"Hao Q, Cai R, Pang Y, Zhang L (2011) From one tree to a forest: a unified solution for structured web data extraction. In: Proceedings of the 34th international ACM SIGIR conference on research and development in Information Retrieval. https:\/\/api.semanticscholar.org\/CorpusID:17002481","DOI":"10.1145\/2009916.2010020"},{"key":"71_CR13","doi-asserted-by":"publisher","unstructured":"Heng Y, Deng C, Li Y, Yu Y, Li Y, Zhang R, Zhang C (2024) ProgGen: generating named entity recognition datasets step-by-step with self-reflexive large language models. In: Findings of the association for computational linguistics: ACL 2024. Association for Computational Linguistics, Bangkok, pp 15992\u201316030. https:\/\/aclanthology.org\/2024.findings-acl.947.\u00a0 https:\/\/doi.org\/10.18653\/v1\/2024.findings-acl.947","DOI":"10.18653\/v1\/2024.findings-acl.947"},{"key":"71_CR14","unstructured":"\u00a0Hong Z, Chard K, Foster I (2024) Combining language and graph models for semi-structured information extraction on the web, arXiv. https:\/\/arxiv.org\/abs\/2402.14129"},{"key":"71_CR15","doi-asserted-by":"publisher","first-page":"1812","DOI":"10.1093\/jamia\/ocad259","volume":"31","author":"Y Hu","year":"2023","unstructured":"Hu Y, Ameer I, Zuo X, Peng X, Zhou Y, Li Z, Li Y, Li J, Jiang X, Xu H (2023) Improving large language models for clinical named entity recognition via prompt engineering. J Am Med Inform Assoc: JAMIA 31:1812\u20131820","journal-title":"J Am Med Inform Assoc: JAMIA"},{"key":"71_CR16","doi-asserted-by":"crossref","unstructured":"\u00a0Huang W, Peng C, Li Z, Liang J, Xiao Y, Wen L, Chen Z (2024) AUTOSCRAPER: a progressive understanding web agent for web crawler generation. In: Proceedings of the 2024 conference on empirical methods in natural language processing (EMNLP)","DOI":"10.18653\/v1\/2024.emnlp-main.141"},{"key":"71_CR17","unstructured":"Jiang A, Sablayrolles A, Mensch A, Bamford C, Chaplot D, de Las Casas D, Bressand F, Lengyel G, Lample G, Saulnier L, Lavaud L, Lachaux M, Stock P, Scao T, Lavril T, Wang T, Lacroix T, Sayed W (2023) Mistral of experts. arXiv https:\/\/arxiv.org\/abs\/2310.06825"},{"key":"71_CR18","doi-asserted-by":"publisher","first-page":"1929","DOI":"10.1093\/jamia\/ocae095","volume":"31","author":"M Li","year":"2024","unstructured":"Li M, Zhou H, Yang H, Zhang R (2024) Rt: a retrieving and chain-of-thought framework for few-shot medical named entity recognition. J Am Med Inform Assoc\u202f: JAMIA 31:1929\u20131938","journal-title":"J Am Med Inform Assoc : JAMIA"},{"key":"71_CR19","doi-asserted-by":"crossref","unstructured":"Li J, Xu Y, Cui L, Wei F (2021) MarkupLM: pre-training of text and markup language for visually rich document understanding. In: Annual meeting of the association for computational linguistics, pp 6078\u20136087. https:\/\/api.semanticscholar.org\/CorpusID:239015946","DOI":"10.18653\/v1\/2022.acl-long.420"},{"key":"71_CR20","doi-asserted-by":"crossref","unstructured":"Lin BY, Sheng Y, Vo N, Tata S (2020) FreeDOM: a transferable neural architecture for structured information extraction on web documents. In: Proceedings of the 26th ACM SIGKDD international conference on knowledge discovery & data mining. https:\/\/api.semanticscholar.org\/CorpusID:221191345","DOI":"10.1145\/3394486.3403153"},{"key":"71_CR21","doi-asserted-by":"crossref","unstructured":"Lockard C, Shiralkar P, Dong XL (2019) Openceres: when open information extraction meets the semi-structured web. In: Proceedings of the 2019 conference of the north American chapter of the association for computational linguistics: human language technologies, pp 3047\u20133056. https:\/\/aclanthology.org\/N19-1309","DOI":"10.18653\/v1\/N19-1309"},{"key":"71_CR22","doi-asserted-by":"publisher","unstructured":"Lockard C, Shiralkar P, Dong XL, Hajishirzi H (2020) ZeroShotCeres: zero-shot relation extraction from semi-structured webpages. In: Proceedings of the 58th annual meeting of the association for computational linguistics. Association for Computational Linguistics, pp 8105\u20138117. https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.721","DOI":"10.18653\/v1\/2020.acl-main.721"},{"key":"71_CR23","doi-asserted-by":"crossref","unstructured":"Omari A, Shoham S, Yahav E (2017) Synthesis of forgiving data extractors. In: Proceedings of the tenth ACM international conference on web search and data mining","DOI":"10.1145\/3018661.3018740"},{"key":"71_CR24","unstructured":"OpenAI (2023) GPT-4 technical report. arXiv https:\/\/arxiv.org\/abs\/2303.08774"},{"key":"71_CR25","unstructured":"Rae JW, Borgeaud S, Cai T, Millican K, Hoffmann J, Song F, Aslanides J, Henderson S, Ring R, Young S, Rutherford E, Hennigan T, Menick J, Cassirer A, Powell R, van den Driessche G, Hendricks LA, Rauh M, Huang P-S, Glaese A, Welbl J, Dathathri S, Huang S, Uesato J, Mellor JFJ, Higgins I, Creswell A, McAleese N, Wu A, Elsen E, Jayakumar SM, Buchatskaya E, Budden D, Sutherland E, Simonyan K, Paganini M, Sifre L, Martens L, Li XL, Kuncoro A, Nematzadeh A, Gribovskaya E, Donato D, Lazaridou A, Mensch A, Lespiau J-B, Tsimpoukelli M, Grigorev NK, Fritz D, Sottiaux T, Pajarskas M, Pohlen T, Gong Z, Toyama D, de Masson d'Autume C, Li Y, Terzi T, Mikulik V, Babuschkin I, Clark A, de Las Casas D, Guy A, Jones C, Bradbury J, Johnson MG, Hechtman BA, Weidinger L, Gabriel I, Isaac WS, Lockhart E, Osindero S, Rimell L, Dyer C, Vinyals O, Ayoub KW, Stanway J, Bennett LL, Hassabis D, Kavukcuoglu K, Irving G (2021) Scaling language models: methods, analysis & insights from training gopher. arXiv. https:\/\/arxiv.org\/abs\/2112.11446"},{"key":"71_CR26","doi-asserted-by":"crossref","unstructured":"Rajpurkar P, Zhang J, Lopyrev K, Liang P (2016) SQuAD: 100,000+ questions for machine comprehension of text. In: Conference on empirical methods in natural language processing, pp 2383\u20132392. https:\/\/api.semanticscholar.org\/CorpusID:11816014","DOI":"10.18653\/v1\/D16-1264"},{"key":"71_CR27","doi-asserted-by":"crossref","unstructured":"Shao W, Zhang R, Ji P, Di F, Hu Y, Xn Y, Cu C, Tao Y, Mi L, Chen L (2023) Astronomical knowledge entity extraction in astrophysics journal articles via large language models. Res Astron Astrophys 24. https:\/\/api.semanticscholar.org\/CorpusID:264555280","DOI":"10.1088\/1674-4527\/ad3d15"},{"key":"71_CR28","unstructured":"Shinn N, Cassano F, Labash B, Gopinath A, Narasimhan K, Yao S (2023) Reflexion: language agents with verbal reinforcement learning. Neural Inf Process Syst:8634\u20138652.\u00a0\u00a0https:\/\/api.semanticscholar.org\/CorpusID:258833055"},{"key":"71_CR29","doi-asserted-by":"publisher","unstructured":"Wang C, Xi L, Chen Z, Hong H, Tang J, Song D (2022a) DeepStruct: pretraining of language models for structure prediction. In: Findings of the association for computational linguistics: ACL 2022, pp 803\u2013823. https:\/\/aclanthology.org\/2022.findings-acl.67. https:\/\/doi.org\/10.18653\/v1\/2022.findings-acl.67","DOI":"10.18653\/v1\/2022.findings-acl.67"},{"key":"71_CR30","doi-asserted-by":"crossref","unstructured":"Wang Q, Fang Y, Ravula A, Feng F, Quan X, Liu D (2022) Webformer: The web-page transformer for structure information extraction. In: Proceedings of the ACM Web Conference 2022. WWW \u201922, pp. 3124\u20133133. Association for Computing Machinery, New York, NY, USA","DOI":"10.1145\/3485447.3512032"},{"key":"71_CR31","unstructured":"Wang S, Sun X, Li X, Ouyang R, Wu F, Zhang T, Li J, Wang G (2023) GPT-NER: named entity recognition via large language models. arXiv https:\/\/arxiv.org\/abs\/2304.10428"},{"key":"71_CR32","unstructured":"Wei J, Wang X, Schuurmans D, Bosma M, Ichter B, Xia F, Chi E, Le Q, Zhou D (2022) Chain of thought prompting elicits reasoning in large language models. arXiv. https:\/\/arxiv.org\/abs\/2201.11903"},{"key":"71_CR33","doi-asserted-by":"publisher","unstructured":"Yuan C, Xie Q, Ananiadou S (2023) Zero-shot temporal relation extraction with ChatGPT. In: The 22nd workshop on biomedical natural language processing and BioNLP shared tasks. Association for Computational Linguistics, Toronto, pp 92\u2013102. https:\/\/doi.org\/10.18653\/v1\/2023.bionlp-1.7","DOI":"10.18653\/v1\/2023.bionlp-1.7"},{"key":"71_CR34","doi-asserted-by":"crossref","unstructured":"Zheng G, Mukherjee S, Dong XL, Li F (2018) OpenTag: open attribute value extraction from product profiles. In: Proceedings of the 24th ACM SIGKDD international conference on knowledge discovery & data mining. https:\/\/api.semanticscholar.org\/CorpusID:46939981","DOI":"10.1145\/3219819.3219839"},{"key":"71_CR35","unstructured":"Zhou Y, Sheng Y, Vo N, Edmonds N, Tata S (2021) Simplified DOM trees for transferable attribute extraction from the web. arXiv"}],"container-title":["Journal of King Saud University Computer and Information Sciences"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s44443-025-00071-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s44443-025-00071-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s44443-025-00071-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T20:42:53Z","timestamp":1757191373000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s44443-025-00071-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,18]]},"references-count":35,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2025,7]]}},"alternative-id":["71"],"URL":"https:\/\/doi.org\/10.1007\/s44443-025-00071-w","relation":{},"ISSN":["1319-1578","2213-1248"],"issn-type":[{"type":"print","value":"1319-1578"},{"type":"electronic","value":"2213-1248"}],"subject":[],"published":{"date-parts":[[2025,6,18]]},"assertion":[{"value":"15 January 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 May 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 June 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that there are no known competing financial interests or personal relationships that could have influenced the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interest"}}],"article-number":"74"}}