{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,24]],"date-time":"2026-06-24T14:57:52Z","timestamp":1782313072776,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":11,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T00:00:00Z","timestamp":1730160000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"UKRI EPSRC","award":["EP\/Y528651\/1"],"award-info":[{"award-number":["EP\/Y528651\/1"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,29]]},"DOI":"10.1145\/3678717.3691215","type":"proceedings-article","created":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T06:29:21Z","timestamp":1732256961000},"page":"693-696","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["CC-GPX: Extracting High-Quality Annotated Geospatial Data from Common Crawl"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-7082-7122","authenticated-orcid":false,"given":"Ilya","family":"Ilyankou","sequence":"first","affiliation":[{"name":"UCL SpaceTimeLab, London, UK"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8420-2141","authenticated-orcid":false,"given":"Meihui","family":"Wang","sequence":"additional","affiliation":[{"name":"UCL SpaceTimeLab, London, UK"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3575-0365","authenticated-orcid":false,"given":"Stefano","family":"Cavazzi","sequence":"additional","affiliation":[{"name":"Ordnance Survey, Southampton, UK"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9506-4266","authenticated-orcid":false,"given":"James","family":"Haworth","sequence":"additional","affiliation":[{"name":"UCL SpaceTimeLab, London, UK"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,11,22]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Meta AI. 2024. Introducing Meta Llama 3: The most capable openly available LLM to date. https:\/\/ai.meta.com\/blog\/meta-llama-3\/"},{"key":"e_1_3_2_1_2_1","volume-title":"Training Data for the Price of a Sandwich: Common Crawl's Impact on Generative AI. (Feb","author":"Baack Stefan","year":"2024","unstructured":"Stefan Baack. 2024. Training Data for the Price of a Sandwich: Common Crawl's Impact on Generative AI. (Feb. 2024)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Asier Guti\u00e9rrez-Fandi\u00f1o David P\u00e9rez-Fern\u00e1ndez Jordi Armengol-Estap\u00e9 David Griol and Zoraida Callejas. 2022. esCorpius: A Massive Spanish Crawling Corpus. http:\/\/arxiv.org\/abs\/2206.15147 arXiv:2206.15147 [cs].","DOI":"10.21437\/IberSPEECH.2022-26"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","unstructured":"Ilya Ilyankou Meihui Wang Stefano Cavazzi and James Haworth. 2024. Quantifying Geospatial in the Common Crawl Corpus. https:\/\/doi.org\/10.48550\/arXiv.2406.04952 arXiv:2406.04952 [cs].","DOI":"10.48550\/arXiv.2406.04952"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.3390\/s21113687"},{"key":"e_1_3_2_1_6_1","volume-title":"Viviano","author":"Luccioni Alexandra Sasha","year":"2021","unstructured":"Alexandra Sasha Luccioni and Joseph D. Viviano. 2021. What's in the Box? A Preliminary Analysis of Undesirable Content in the Common Crawl Corpus. http:\/\/arxiv.org\/abs\/2105.02732 arXiv:2105.02732 [cs]."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.5167\/uzh-80038"},{"key":"e_1_3_2_1_8_1","volume-title":"GPT-2, GPT-3, GPT-NeoX-20B, Megatron-11B, MT-NLG, and Gopher.","author":"Thompson Alan D","year":"2022","unstructured":"Alan D Thompson. 2022. What's in my AI? A Comprehensive Analysis of Datasets Used to Train GPT-1, GPT-2, GPT-3, GPT-NeoX-20B, Megatron-11B, MT-NLG, and Gopher. (2022)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-41682-8_22"},{"key":"e_1_3_2_1_10_1","volume-title":"Rory M Butler, Anton Alexandrov, Valdemar R Thanner, Georgios Tsolakis, Haris Jabbar, Ian Foster, Bo Li, and Rick Stevens.","author":"Weber Maurice","year":"2023","unstructured":"Maurice Weber, Carlo Siebenschuh, Rory M Butler, Anton Alexandrov, Valdemar R Thanner, Georgios Tsolakis, Haris Jabbar, Ian Foster, Bo Li, and Rick Stevens. 2023. WordScape: a Pipeline to extract multilingual, visually rich Documents with Layout Annotations from Web Crawl Data. (2023)."},{"key":"e_1_3_2_1_11_1","unstructured":"Wenhao Zhu Hongyi Liu Qingxiu Dong Jingjing Xu Shujian Huang Lingpeng Kong Jiajun Chen and Lei Li. 2023. Multilingual Machine Translation with Large Language Models: Empirical Results and Analysis. http:\/\/arxiv.org\/abs\/2304.04675 arXiv:2304.04675 [cs]."}],"event":{"name":"SIGSPATIAL '24: The 32nd ACM International Conference on Advances in Geographic Information Systems","location":"Atlanta GA USA","acronym":"SIGSPATIAL '24","sponsor":["SIGSPATIAL ACM Special Interest Group on Spatial Information"]},"container-title":["Proceedings of the 32nd ACM International Conference on Advances in Geographic Information Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3678717.3691215","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3678717.3691215","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T10:41:09Z","timestamp":1755859269000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3678717.3691215"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,29]]},"references-count":11,"alternative-id":["10.1145\/3678717.3691215","10.1145\/3678717"],"URL":"https:\/\/doi.org\/10.1145\/3678717.3691215","relation":{},"subject":[],"published":{"date-parts":[[2024,10,29]]},"assertion":[{"value":"2024-11-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}