{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,14]],"date-time":"2026-05-14T04:19:57Z","timestamp":1778732397736,"version":"3.51.4"},"reference-count":37,"publisher":"American Chemical Society (ACS)","issue":"8","license":[{"start":{"date-parts":[[2024,3,28]],"date-time":"2024-03-28T00:00:00Z","timestamp":1711584000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100000287","name":"Royal Academy of Engineering","doi-asserted-by":"publisher","award":["RCSRF1819\/7\/10"],"award-info":[{"award-number":["RCSRF1819\/7\/10"]}],"id":[{"id":"10.13039\/501100000287","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100004349","name":"BASF","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100004349","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100000271","name":"Science and Technology Facilities Council","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100000271","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["J. Chem. Inf. Model."],"published-print":{"date-parts":[[2024,4,22]]},"DOI":"10.1021\/acs.jcim.4c00063","type":"journal-article","created":{"date-parts":[[2024,3,28]],"date-time":"2024-03-28T05:42:02Z","timestamp":1711604522000},"page":"3205-3212","source":"Crossref","is-referenced-by-count":7,"title":["How Beneficial Is Pretraining on a Narrow Domain-Specific Corpus for Information Extraction about Photocatalytic Water Splitting?"],"prefix":"10.1021","volume":"64","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0475-403X","authenticated-orcid":true,"given":"Taketomo","family":"Isazawa","sequence":"first","affiliation":[{"name":"Cavendish Laboratory, Department of Physics, University of Cambridge, J. J. Thomson Avenue, Cambridge CB3 0HE, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1552-8743","authenticated-orcid":true,"given":"Jacqueline M.","family":"Cole","sequence":"additional","affiliation":[{"name":"Cavendish Laboratory, Department of Physics, University of Cambridge, J. J. Thomson Avenue, Cambridge CB3 0HE, U.K."},{"name":"ISIS Neutron and Muon Source, STFC Rutherford Appleton Laboratory, Harwell Science and Innovation Campus, Didcot, Oxfordshire OX11 0QX, U.K."}]}],"member":"316","published-online":{"date-parts":[[2024,3,28]]},"reference":[{"key":"ref1\/cit1","unstructured":"Beltagy, I.; Cohan, A.; Lo, K. SciBERT: Pretrained Contextualized Embeddings for Scientific Text. 2019, http:\/\/arxiv.org\/abs\/1903.10676."},{"key":"ref2\/cit2","doi-asserted-by":"publisher","DOI":"10.1021\/acs.jcim.2c00035"},{"key":"ref3\/cit3","doi-asserted-by":"publisher","DOI":"10.1093\/bioinformatics\/btz682"},{"key":"ref4\/cit4","doi-asserted-by":"publisher","DOI":"10.1021\/acs.jcim.2c01259"},{"key":"ref5\/cit5","doi-asserted-by":"publisher","DOI":"10.1038\/s41524-022-00784-w"},{"key":"ref6\/cit6","unstructured":"Sanchez, C.; Zhang, Z. The Effects of In-domain Corpus Size on pre-training BERT. 2022, arXiv:2212.07914."},{"key":"ref7\/cit7","doi-asserted-by":"crossref","unstructured":"Li, X.; Yin, F.; Sun, Z.; Li, X.; Yuan, A.; Chai, D.; Zhou, M.; Li, J. Entity-Relation Extraction as Multi-Turn Question Answering. 2019, arXiv preprint arXiv:1905.05529.","DOI":"10.18653\/v1\/P19-1129"},{"key":"ref8\/cit8","doi-asserted-by":"publisher","DOI":"10.1039\/D2SC04322J"},{"key":"ref9\/cit9","doi-asserted-by":"publisher","DOI":"10.1038\/s41597-023-02511-6"},{"key":"ref10\/cit10","unstructured":"Devlin, J.; Chang, M.; Lee, K.; Toutanova, K. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. 2018, arXiv preprint arXiv:1810.04805."},{"key":"ref11\/cit11","doi-asserted-by":"publisher","DOI":"10.1021\/acs.jcim.6b00207"},{"key":"ref12\/cit12","doi-asserted-by":"publisher","DOI":"10.1021\/acs.jcim.1c00446"},{"key":"ref13\/cit13","doi-asserted-by":"crossref","unstructured":"Lo, K.; Wang, L. L.; Neumann, M.; Kinney, R.; Weld, D. S2ORC: The Semantic Scholar Open Research Corpus. In  Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics; Association for Computational Linguistics, 2020, pp 4969\u20134983.","DOI":"10.18653\/v1\/2020.acl-main.447"},{"key":"ref14\/cit14","unstructured":"Liu, Y.; Ott, M.; Goyal, N.; Du, J.; Joshi, M.; Chen, D.; Levy, O.; Lewis, M.; Zettlemoyer, L.; Stoyanov, V. RoBERTa: A Robustly Optimized BERT Pretraining Approach. 2019, arXiv preprint arXiv:1907.11692."},{"key":"ref15\/cit15","unstructured":"Wu, Y.; Schuster, M.; Chen, Z.; Le, Q. V.; Norouzi, M.; Macherey, W.; Krikun, M.; Cao, Y.; Gao, Q.; Macherey, K.; Klingner, J.; Shah, A.; Johnson, M.; Liu, X.; Kaiser, L.; Gouws, S.; Kato, Y.; Kudo, T.; Kazawa, H.; Stevens, K.; Kurian, G.; Patil, N.; Wang, W.; Young, C.; Smith, J.; Riesa, J.; Rudnick, A.; Vinyals, O.; Corrado, G.; Hughes, M.; Dean, J. Google\u2019s Neural Machine Translation System: Bridging the Gap between Human and Machine Translation. 2016, arXiv preprint arXiv:1609.08144."},{"key":"ref16\/cit16","unstructured":"HuggingFace Tokenizers. 2022. https:\/\/github.com\/huggingface\/tokenizers (accessed March 29, 2023)."},{"key":"ref17\/cit17","doi-asserted-by":"publisher","DOI":"10.1038\/s41597-020-00602-2"},{"key":"ref18\/cit18","doi-asserted-by":"crossref","unstructured":"Wolf, T.; Debut, L.; Sanh, V.; Chaumond, J.; Delangue, C.; Moi, A.; Cistac, P.; Rault, T.; Louf, R.; Funtowicz, M.; Brew, J. HuggingFace\u2019s Transformers: State-of-the-art Natural Language Processing. 2019, arXiv preprint arXiv:1910.03771.","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"ref19\/cit19","doi-asserted-by":"crossref","unstructured":"Rajbhandari, S.; Rasley, J.; Ruwase, O.; He, Y. ZeRO: Memory Optimizations Toward Training Trillion Parameter Models. In  SC20: International Conference for High Performance Computing, Networking, Storage and Analysis; IEEE, 2020, pp 1\u201316.","DOI":"10.1109\/SC41405.2020.00024"},{"key":"ref20\/cit20","doi-asserted-by":"crossref","unstructured":"Rajpurkar, P.; Jia, R.; Liang, P. Know What You Don\u2019t Know: Unanswerable Questions for SQuAD. 2018, arXiv preprint arXiv:1806.03822.","DOI":"10.18653\/v1\/P18-2124"},{"key":"ref21\/cit21","doi-asserted-by":"publisher","DOI":"10.1039\/c2nr30819c"},{"key":"ref22\/cit22","doi-asserted-by":"crossref","unstructured":"Polak, M. P.; Morgan, D. Extracting Accurate Materials Data from Research Papers with Conversational Language Models and Prompt Engineering. 2023; http:\/\/arxiv.org\/abs\/2303.05352.","DOI":"10.1038\/s41467-024-45914-8"},{"key":"ref23\/cit23","unstructured":"Dunn, A.; Dagdelen, J.; Walker, N.; Lee, S.; Rosen, A. S.; Ceder, G.; Persson, K.; Jain, A. Structured information extraction from complex scientific text with fine-tuned large language models. 2022; http:\/\/arxiv.org\/abs\/2212.05238."},{"key":"ref24\/cit24","first-page":"5776","volume":"33","author":"Wang W.","year":"2020","journal-title":"Adv. Neural Inf. Process Syst."},{"key":"ref25\/cit25","unstructured":"He, P.; Gao, J.; Chen, W. DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing. 2021, arXiv preprint arXiv:2111.09543."},{"key":"ref26\/cit26","unstructured":"deepset\/bert-base-uncased-squad2\u00b7Hugging Face \u2013 huggingface.co. https:\/\/huggingface.co\/deepset\/bert-base-uncased-squad2 (accessed\nApril 21, 2023)."},{"key":"ref27\/cit27","unstructured":"deepset\/deberta-v3-large-squad2\u00b7Hugging\nFace \u2013 huggingface.co. https:\/\/huggingface.co\/deepset\/deberta-v3-large-squad2 (accessed\nApril 24, 2023)."},{"key":"ref28\/cit28","doi-asserted-by":"crossref","unstructured":"Strubell, E.; Ganesh, A.; McCallum, A. Energy and Policy Considerations for Deep Learning in NLP. 2019, arXiv preprint arXiv:1906.02243.","DOI":"10.18653\/v1\/P19-1355"},{"key":"ref29\/cit29","first-page":"19","volume-title":"Proceedings of the IEEE international conference on computer vision","author":"Zhu Y.","year":"2015"},{"key":"ref30\/cit30","unstructured":"Gao, L.; Biderman, S.; Black, S.; Golding, L.; Hoppe, T.; Foster, C.; Phang, J.; He, H.; Thite, A.; Nabeshima, N.; Presser, S.; Leahy, C. The Pile: An 800GB Dataset of Diverse Text for Language Modeling. 2021, arXiv:2101.00027."},{"key":"ref31\/cit31","unstructured":"Hoffmann, J.; Borgeaud, S.; Mensch, A.; Buchatskaya, E.; Cai, T.; Rutherford, E.; de Las Casas, D.; Hendricks, L. A.; Welbl, J.; Clark, A.; Hennigan, T.; Noland, E.; Millican, K.; van den Driessche, G.; Damoc, B.; Guy, A.; Osindero, S.; Simonyan, K.; Elsen, E.; Rae, J. W.; Vinyals, O.; Sifre, L. Training Compute-Optimal Large Language Models. 2022, arXiv preprint arXiv:2203."},{"key":"ref32\/cit32","unstructured":"ti250\/photocatalysis_bert_training_scripts. https:\/\/github.com\/ti250\/photocatalysis_bert_training_scripts (accessed March 2, 2024)."},{"key":"ref33\/cit33","unstructured":"ti250\/e2e_workflow. https:\/\/github.com\/ti250\/e2e_workflow (accessed March 2, 2024)."},{"key":"ref34\/cit34","unstructured":"ti250\/photocatalysis_bert_kr. https:\/\/github.com\/ti250\/photocatalysis_bert_kr (accessed March 2, 2024)."},{"key":"ref35\/cit35","unstructured":"ti250\/photocatalysis_bert_dataset. https:\/\/github.com\/ti250\/photocalysis_bert_dataset (accessed\nMarch 2, 2024)."},{"key":"ref36\/cit36","unstructured":"CambridgeMolecularEngineering\/chemdataextractor2:\nChemDataExtractor\nVersion 2.2. https:\/\/github.com\/CambridgeMolecularEngineering\/chemdataextractor2 (accessed September 16, 2023)."},{"key":"ref37\/cit37","unstructured":"Cambridge\nMolecular Engineering (Molecular Engineering). https:\/\/huggingface.co\/CambridgeMolecularEngineering (accessed\nMarch 2, 2024)."}],"container-title":["Journal of Chemical Information and Modeling"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/pubs.acs.org\/doi\/pdf\/10.1021\/acs.jcim.4c00063","content-type":"application\/pdf","content-version":"vor","intended-application":"unspecified"},{"URL":"https:\/\/pubs.acs.org\/doi\/pdf\/10.1021\/acs.jcim.4c00063","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,4,22]],"date-time":"2024-04-22T08:12:21Z","timestamp":1713773541000},"score":1,"resource":{"primary":{"URL":"https:\/\/pubs.acs.org\/doi\/10.1021\/acs.jcim.4c00063"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,3,28]]},"references-count":37,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2024,4,22]]}},"alternative-id":["10.1021\/acs.jcim.4c00063"],"URL":"https:\/\/doi.org\/10.1021\/acs.jcim.4c00063","relation":{},"ISSN":["1549-9596","1549-960X"],"issn-type":[{"value":"1549-9596","type":"print"},{"value":"1549-960X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,3,28]]}}}