{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,30]],"date-time":"2026-01-30T03:54:01Z","timestamp":1769745241662,"version":"3.49.0"},"reference-count":30,"publisher":"Elsevier BV","issue":"1","license":[{"start":{"date-parts":[[2018,6,1]],"date-time":"2018-06-01T00:00:00Z","timestamp":1527811200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2018,6,1]],"date-time":"2018-06-01T00:00:00Z","timestamp":1527811200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2018,5,22]],"date-time":"2018-05-22T00:00:00Z","timestamp":1526947200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by-nc-nd\/3.0\/"}],"funder":[{"DOI":"10.13039\/100000002","name":"National Institutes of Health","doi-asserted-by":"publisher","award":["P01AG03934"],"award-info":[{"award-number":["P01AG03934"]}],"id":[{"id":"10.13039\/100000002","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000002","name":"National Institutes of Health","doi-asserted-by":"publisher","award":["R01LM10817"],"award-info":[{"award-number":["R01LM10817"]}],"id":[{"id":"10.13039\/100000002","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Data and Information Management"],"published-print":{"date-parts":[[2018,6]]},"DOI":"10.2478\/dim-2018-0004","type":"journal-article","created":{"date-parts":[[2018,5,23]],"date-time":"2018-05-23T18:16:42Z","timestamp":1527099402000},"page":"27-36","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":3,"title":["Design of a generic, open platform for machine learning-assisted indexing and clustering of articles in PubMed, a biomedical bibliographic database"],"prefix":"10.1016","volume":"2","author":[{"given":"Neil R.","family":"Smalheiser","sequence":"first","affiliation":[]},{"given":"Aaron M.","family":"Cohen","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.2478\/dim-2018-0004_bib001","series-title":"Mining text data","first-page":"465","article-title":"Biomedical text mining: a survey of recent progress","author":"Simpson","year":"2012"},{"issue":"0","key":"10.2478\/dim-2018-0004_bib002","article-title":"Text mining resources for the life sciences","volume":"2016","author":"Przyby\u0142a","year":"2016","journal-title":"Database"},{"key":"10.2478\/dim-2018-0004_bib003","first-page":"55","article-title":"The Stanford CoreNLP natural language processing toolkit","author":"Manning","year":"2014","journal-title":"ACL (System Demonstrations)"},{"issue":"5","key":"10.2478\/dim-2018-0004_bib004","doi-asserted-by":"crossref","first-page":"507","DOI":"10.1136\/jamia.2009.001560","article-title":"Mayo clinical Text Analysis and Knowledge Extraction System (cTAKES): architecture, component evaluation and applications","volume":"17","author":"Savova","year":"2010","journal-title":"Journal of the American Medical Informatics Association: JAMIA"},{"key":"10.2478\/dim-2018-0004_bib005","doi-asserted-by":"crossref","DOI":"10.1093\/database\/baw066","article-title":"Argo: enabling the development of bespoke workflows and services for disease annotation","author":"Batista-Navarro","year":"2016","journal-title":"Database (Oxford)"},{"key":"10.2478\/dim-2018-0004_bib006","series-title":"In LREC","first-page":"3276","article-title":"An NLP Curator (or: How I Learned to Stop Worrying and Love NLP Pipelines)","author":"Clarke","year":"2012"},{"issue":"1","key":"10.2478\/dim-2018-0004_bib007","doi-asserted-by":"crossref","first-page":"10","DOI":"10.1145\/1656274.1656278","article-title":"The WEKA data mining software: an update","volume":"11","author":"Hall","year":"2009","journal-title":"ACM SIGKDD explorations newsletter"},{"key":"10.2478\/dim-2018-0004_bib008","doi-asserted-by":"crossref","first-page":"2","DOI":"10.1007\/s13755-017-0023-z","article-title":"Progressive sampling-based Bayesian optimization for efficient and automatic machine learning model selection","volume":"5","author":"Zeng","year":"2017","journal-title":"Health Inf Sci Syst"},{"issue":"3","key":"10.2478\/dim-2018-0004_bib009","first-page":"130","volume":"14","author":"Porter","year":"1980","journal-title":"An algorithm for suffix stripping, Program"},{"key":"10.2478\/dim-2018-0004_bib010","author":"Torvik"},{"issue":"2","key":"10.2478\/dim-2018-0004_bib011","first-page":"140","article-title":"A probabilistic similarity metric for Medline records: A model for author name disambiguation","volume":"56","author":"Torvik","year":"2005","journal-title":"Journal of the Association for Information Science and Technology"},{"issue":"3","key":"10.2478\/dim-2018-0004_bib012","doi-asserted-by":"crossref","first-page":"11","DOI":"10.1145\/1552303.1552304","article-title":"Author name disambiguation in MEDLINE","volume":"3","author":"Torvik","year":"2009","journal-title":"ACM Transactions on Knowledge Discovery from Data (TKDD)"},{"issue":"3","key":"10.2478\/dim-2018-0004_bib013","doi-asserted-by":"crossref","first-page":"707","DOI":"10.1093\/jamia\/ocu025","article-title":"Automated confidence ranked classification of randomized controlled trial articles: an aid to evidence-based medicine","volume":"22","author":"Cohen","year":"2015","journal-title":"Journal of the American Medical Informatics Association"},{"issue":"12","key":"10.2478\/dim-2018-0004_bib014","article-title":"Three journal similarity metrics and their application to biomedical journals","volume":"9","author":"D'Souza","year":"2014","journal-title":"PloS one"},{"key":"10.2478\/dim-2018-0004_bib015","first-page":"7","article-title":"Two Similarity Metrics for Medical Subject Headings (MeSH):: An Aid to Biomedical Text Mining and Author Name Disambiguation","author":"Smalheiser","year":"2016","journal-title":"Journal of biomedical discovery and collaboration"},{"key":"10.2478\/dim-2018-0004_bib016","author":"Smalheiser"},{"key":"10.2478\/dim-2018-0004_bib017","first-page":"3111","article-title":"Distributed representations of words and phrases and their compositionality","author":"Mikolov","year":"2013","journal-title":"Advances in neural information processing systems"},{"issue":"6","key":"10.2478\/dim-2018-0004_bib018","doi-asserted-by":"crossref","first-page":"1166","DOI":"10.1093\/jamia\/ocw028","article-title":"Learning statistical models of phenotypes using noisy labeled training data","volume":"23","author":"Agarwal","year":"2016","journal-title":"Journal of the American Medical Informatics Association"},{"issue":"4","key":"10.2478\/dim-2018-0004_bib019","doi-asserted-by":"crossref","first-page":"189","DOI":"10.1016\/0020-0190(96)00006-3","article-title":"On the sample complexity of noise-tolerant learning","volume":"57","author":"Aslam","year":"1996","journal-title":"Information Processing Letters"},{"key":"10.2478\/dim-2018-0004_bib020","series-title":"In Proceedings of the 22nd international conference on Machine learning","first-page":"625","article-title":"Predicting good probabilities with supervised learning","author":"Niculescu-Mizil","year":"2005"},{"key":"10.2478\/dim-2018-0004_bib021","series-title":"Data clustering: algorithms and applications","author":"Aggarwal","year":"2013"},{"key":"10.2478\/dim-2018-0004_bib022","author":"Law"},{"key":"10.2478\/dim-2018-0004_bib023","series-title":"Data Mining: Practical machine learning tools and techniques","author":"Witten","year":"2016"},{"key":"10.2478\/dim-2018-0004_bib024","series-title":"In Data Mining Workshops (ICDMW), 2016 IEEE 16th International Conference on","first-page":"1007","article-title":"Context-Specific Recommendation System for Predicting Similar PubMed Articles","author":"Mohammadi","year":"2016"},{"key":"10.2478\/dim-2018-0004_bib025","doi-asserted-by":"crossref","first-page":"59","DOI":"10.1016\/j.jbi.2016.06.001","article-title":"Topic detection using paragraph vectors to support active learning in systematic reviews","volume":"62","author":"Hashimoto","year":"2016","journal-title":"Journal of biomedical informatics"},{"key":"10.2478\/dim-2018-0004_bib026","series-title":"In Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) 2017","first-page":"763","article-title":"TextFlow: A Text Similarity Measure based on Continuous Sequences","volume":"Vol. 1","author":"Mrabet","year":"2017"},{"key":"10.2478\/dim-2018-0004_bib027","doi-asserted-by":"crossref","first-page":"612","DOI":"10.12688\/f1000research.11389.1","article-title":"PubRunner: A light-weight framework for updating text mining results","volume":"6","author":"Anekalla","year":"2017","journal-title":"F1000Res"},{"issue":"1","key":"10.2478\/dim-2018-0004_bib028","first-page":"1235","article-title":"Mllib: Machine learning in apache spark","volume":"17","author":"Meng","year":"2016","journal-title":"The Journal of Machine Learning Research"},{"key":"10.2478\/dim-2018-0004_bib029","series-title":"In Proceedings of the 21th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining","first-page":"2323","article-title":"Large scale distributed data science using apache spark","author":"Shanahan","year":"2015"},{"issue":"1","key":"10.2478\/dim-2018-0004_bib030","doi-asserted-by":"crossref","first-page":"193","DOI":"10.1093\/jamia\/ocv044","article-title":"RobotReviewer: evaluation of a system for automatically assessing bias in clinical trials","volume":"23","author":"Marshall","year":"2015","journal-title":"Journal of the American Medical Informatics Association"}],"container-title":["Data and Information Management"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/content.sciendo.com\/view\/journals\/dim\/2\/1\/article-p27.xml","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S2543925122000870?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S2543925122000870?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.sciendo.com\/pdf\/10.2478\/dim-2018-0004","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,11]],"date-time":"2025-11-11T15:19:04Z","timestamp":1762874344000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S2543925122000870"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,6]]},"references-count":30,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2018,6]]}},"alternative-id":["S2543925122000870"],"URL":"https:\/\/doi.org\/10.2478\/dim-2018-0004","relation":{},"ISSN":["2543-9251"],"issn-type":[{"value":"2543-9251","type":"print"}],"subject":[],"published":{"date-parts":[[2018,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Design of a generic, open platform for machine learning-assisted indexing and clustering of articles in PubMed, a biomedical bibliographic database","name":"articletitle","label":"Article Title"},{"value":"Data and Information Management","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.2478\/dim-2018-0004","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"Copyright \u00a9 2018 \u00a9 2018 Neil R. Smalheiser, Aaron M. Cohe, published by Sciendo. Published by Elsevier Ltd","name":"copyright","label":"Copyright"}]}}