{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,20]],"date-time":"2026-04-20T13:53:23Z","timestamp":1776693203024,"version":"3.51.2"},"reference-count":60,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2023,4,15]],"date-time":"2023-04-15T00:00:00Z","timestamp":1681516800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,4,15]],"date-time":"2023-04-15T00:00:00Z","timestamp":1681516800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/100000208","name":"Institute of Museum and Library Services","doi-asserted-by":"publisher","award":["LG-86-18-0061-18"],"award-info":[{"award-number":["LG-86-18-0061-18"]}],"id":[{"id":"10.13039\/100000208","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Digit Libr"],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1007\/s00799-023-00354-5","type":"journal-article","created":{"date-parts":[[2023,4,15]],"date-time":"2023-04-15T17:02:21Z","timestamp":1681578141000},"page":"311-325","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Approximate nearest neighbor for long document relationship labeling in digital libraries"],"prefix":"10.1007","volume":"24","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9058-2280","authenticated-orcid":false,"given":"Peter","family":"Organisciak","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Benjamin M.","family":"Schmidt","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1131-8132","authenticated-orcid":false,"given":"Matthew","family":"Durward","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,4,15]]},"reference":[{"key":"354_CR1","doi-asserted-by":"publisher","first-page":"106","DOI":"10.1007\/978-3-642-03761-0_12","volume-title":"Advances in Focused Retrieval","author":"G Kazai","year":"2009","unstructured":"Kazai, G., Doucet, A., Landoni, M.: Overview of the INEX 2008 Book Track. In: Geva, S., Kamps, J., Trotman, A. (eds.) Advances in Focused Retrieval, pp. 106\u2013123. Springer, Berlin, Heidelberg (2009)"},{"key":"354_CR2","doi-asserted-by":"crossref","unstructured":"Cummins, R.: A study of retrieval models for long documents and queries in information retrieval. In: Proceedings of the 25th International Conference on World Wide Web. International World Wide Web Conferences Steering Committee, Republic and Canton of Geneva, CHE, pp. 795\u2013805 (2016)","DOI":"10.1145\/2872427.2883009"},{"key":"354_CR3","doi-asserted-by":"crossref","unstructured":"Indyk, P., Motwani, R.: Approximate nearest neighbors: towards removing the curse of dimensionality. In: Proceedings of the thirtieth annual ACM symposium on Theory of computing. Association for Computing Machinery, Dallas, Texas, USA, pp. 604\u2013613 (1998)","DOI":"10.1145\/276698.276876"},{"key":"354_CR4","doi-asserted-by":"crossref","unstructured":"Charikar, MS.: Similarity estimation techniques from rounding algorithms. In: Proceedings of the thiry-fourth annual ACM symposium on Theory of computing. Association for Computing Machinery, Montreal, Quebec, Canada, pp. 380\u2013388 (2002)","DOI":"10.1145\/509907.509965"},{"key":"354_CR5","doi-asserted-by":"crossref","unstructured":"Henzinger, M.: Finding near-duplicate web pages: a large-scale evaluation of algorithms. In: Proceedings of the 29th annual international ACM SIGIR conference on Research and development in information retrieval. Association for Computing Machinery, Seattle, Washington, USA, pp. 284\u2013291 (2006)","DOI":"10.1145\/1148170.1148222"},{"key":"354_CR6","unstructured":"HathiTrust.: About. In: HathiTrust Digit. Libr. https:\/\/www.hathitrust.org\/about (2022). Accessed 12 Nov 2017"},{"key":"354_CR7","first-page":"50","volume":"51","author":"JS Downie","year":"2016","unstructured":"Downie, J.S., Furlough, M., McDonald, R.H., et al.: The hathitrust research center: exploring the full-text frontier. Educ. Rev. 51, 50\u201351 (2016)","journal-title":"Educ. Rev."},{"key":"354_CR8","doi-asserted-by":"publisher","first-page":"176","DOI":"10.1126\/science.1199644","volume":"331","author":"J-B Michel","year":"2011","unstructured":"Michel, J.-B., Shen, Y.K., Aiden, A.P., et al.: Quantitative analysis of culture using millions of digitized books. Science 331, 176\u2013182 (2011)","journal-title":"Science"},{"key":"354_CR9","doi-asserted-by":"crossref","unstructured":"Schreibman, S.: Non-consumptive reading. In: From Literature to Cultural Literacy. Springer, pp. 148\u2013165 (2014)","DOI":"10.1057\/9781137429704_11"},{"key":"354_CR10","unstructured":"York, J.: Building a future by preserving our past: the preservation infrastructure of HathiTrust digital library. In: World Library and Information Congress: 76th IFLA General Conference and Assembly. pp. 10\u201315 (2010)"},{"key":"354_CR11","doi-asserted-by":"publisher","first-page":"145","DOI":"10.1007\/978-3-642-14556-8_16","volume-title":"Focused Retrieval and Evaluation","author":"G Kazai","year":"2010","unstructured":"Kazai, G., Doucet, A., Koolen, M., Landoni, M.: Overview of the INEX 2009 Book Track. In: Geva, S., Kamps, J., Trotman, A. (eds.) Focused Retrieval and Evaluation, pp. 145\u2013159. Springer, Berlin, Heidelberg (2010)"},{"key":"354_CR12","doi-asserted-by":"publisher","first-page":"98","DOI":"10.1007\/978-3-642-23577-1_9","volume-title":"Comparative Evaluation of Focused Retrieval","author":"G Kazai","year":"2011","unstructured":"Kazai, G., Koolen, M., Kamps, J., et al.: Overview of the INEX 2010 Book Track: Scaling Up the Evaluation Using Crowdsourcing. In: Geva, S., Kamps, J., Schenkel, R., Trotman, A. (eds.) Comparative Evaluation of Focused Retrieval, pp. 98\u2013117. Springer, Berlin, Heidelberg (2011)"},{"key":"354_CR13","doi-asserted-by":"crossref","unstructured":"Salton, G., Allan, J., Buckley, C.: Approaches to passage retrieval in full text information systems. In: Proceedings of the 16th annual international ACM SIGIR conference on Research and development in information retrieval. Association for Computing Machinery, New York, NY, USA, pp. 49\u201358 (1993)","DOI":"10.1145\/160688.160693"},{"key":"354_CR14","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural. Comput. 9, 1735\u20131780 (1997)","journal-title":"Neural. Comput."},{"key":"354_CR15","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., et al.: Attention Is All You Need. ArXiv1706.03762 (2017)"},{"key":"354_CR16","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: Pre-training of deep bidirectional transformers for language understanding (2018)"},{"key":"354_CR17","first-page":"1","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., Shazeer, N., Roberts, A., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J Mach Learn Res 21, 1\u201367 (2020)","journal-title":"J Mach Learn Res"},{"key":"354_CR18","unstructured":"Beltagy, I., Peters, M.E., Cohan, A.: Longformer: the long-document transformer. ArXiv200405150 Cs (2020)"},{"key":"354_CR19","unstructured":"Sutton, R.: The Bitter Lesson (2019). http:\/\/www.incompleteideas.net\/IncIdeas\/BitterLesson.html"},{"key":"354_CR20","unstructured":"Liu, Y., Ott, M., Goyal, N., et al.: RoBERTa: A Robustly Optimized BERT Pretraining Approach. (2019). ArXiv190711692 Cs"},{"key":"354_CR21","unstructured":"Kaplan, J., McCandlish, S., Henighan, T., et al.: Scaling laws for neural language models (2020)"},{"key":"354_CR22","doi-asserted-by":"crossref","unstructured":"Strubell, E., Ganesh, A., McCallum, A.: Energy and policy considerations for deep learning in NLP. (2019). ArXiv190602243 Cs","DOI":"10.18653\/v1\/P19-1355"},{"key":"354_CR23","doi-asserted-by":"publisher","first-page":"54","DOI":"10.1145\/3381831","volume":"63","author":"R Schwartz","year":"2020","unstructured":"Schwartz, R., Dodge, J., Smith, N.A., Etzioni, O.: Green AI. Commun ACM 63, 54\u201363 (2020). https:\/\/doi.org\/10.1145\/3381831","journal-title":"Commun ACM"},{"key":"354_CR24","unstructured":"Han, S., Mao, H., Dally, WJ.: Deep compression: compressing deep neural networks with pruning, trained quantization and huffman coding. (2016). ArXiv151000149 Cs"},{"key":"354_CR25","doi-asserted-by":"crossref","unstructured":"Jiang, J.Y., Zhang, M., Li, C., et al.: Semantic text matching for long-form documents. In: The World Wide Web Conference. Association for Computing Machinery, New York, NY, USA, pp. 795\u2013806 (2019)","DOI":"10.1145\/3308558.3313707"},{"key":"354_CR26","unstructured":"Mikolov, T., Sutskever, I., Chen, K., et al.: Distributed representations of words and phrases and their compositionality. In: Burges CJC, Bottou L, Welling M, et al (eds) Advances in Neural Information Processing Systems 26. Curran Associates, Inc., pp. 3111\u20133119 (2013)"},{"key":"354_CR27","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.: Glove: Global vectors for word representation. In: Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP). pp. 1532\u20131543 (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"354_CR28","unstructured":"McCann, B., Bradbury, J., Xiong, C., Socher, R.: Learned in translation: Contextualized word vectors. In: Advances in Neural Information Processing Systems. pp. 6294\u20136305 (2017)"},{"key":"354_CR29","doi-asserted-by":"crossref","unstructured":"Cer, D., Yang, Y., Kong, S., et al.: Universal Sentence Encoder. (2018). ArXiv180311175 Cs","DOI":"10.18653\/v1\/D18-2029"},{"key":"354_CR30","doi-asserted-by":"crossref","unstructured":"Reimers, N., Gurevych, I.: Sentence-BERT: Sentence embeddings using siamese BERT-networks (2019)","DOI":"10.18653\/v1\/D19-1410"},{"key":"354_CR31","doi-asserted-by":"crossref","unstructured":"Ni, J., \u00c1brego, G.H., Constant, N., et al.: Sentence-T5: scalable sentence encoders from pre-trained text-to-text models. (2021)","DOI":"10.18653\/v1\/2022.findings-acl.146"},{"key":"354_CR32","unstructured":"HathiTrust Collections Committee.: HathiTrust monographic duplication and uniqueness: 2017 report and recommendations from the HathiTrust collections committee. (2017)"},{"key":"354_CR33","doi-asserted-by":"publisher","first-page":"1157","DOI":"10.1016\/S0169-7552(97)00031-7","volume":"29","author":"AZ Broder","year":"1997","unstructured":"Broder, A.Z., Glassman, S.C., Manasse, M.S., Zweig, G.: Syntactic clustering of the web. Comput Netw ISDN Syst 29, 1157\u20131166 (1997)","journal-title":"Comput Netw ISDN Syst"},{"key":"354_CR34","doi-asserted-by":"publisher","first-page":"824","DOI":"10.1109\/TPAMI.2018.2889473","volume":"42","author":"YA Malkov","year":"2020","unstructured":"Malkov, Y.A., Yashunin, D.A.: Efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs. IEEE Trans Pattern Anal Mach Intell 42, 824\u2013836 (2020)","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"354_CR35","doi-asserted-by":"crossref","unstructured":"Aum\u00fcller, M., Bernhardsson, E., Faithfull, A.: ANN-Benchmarks: A Benchmarking Tool for Approximate Nearest Neighbor Algorithms. (2018). ArXiv:180705614 Cs","DOI":"10.1007\/978-3-319-68474-1_3"},{"key":"354_CR36","unstructured":"Bernhardsson, E. (2013) Annoy [C++]. Spotify. https:\/\/github.com\/spotify\/annoy"},{"key":"354_CR37","unstructured":"Korn, F., Sidiropoulos, N., Faloutsos, C., et al.: Fast nearest neighbor search in medical image databases. (1998)"},{"key":"354_CR38","doi-asserted-by":"publisher","first-page":"3463","DOI":"10.1021\/acs.jproteome.8b00359","volume":"17","author":"W Bittremieux","year":"2018","unstructured":"Bittremieux, W., Meysman, P., Noble, W.S., Laukens, K.: Fast open modification spectral library searching through approximate nearest neighbor indexing. J Proteome Res 17, 3463\u20133474 (2018)","journal-title":"J Proteome Res"},{"key":"354_CR39","doi-asserted-by":"crossref","unstructured":"Dershowitz, N., Labenski, D., Silberpfennig, A., et al.: Relating articles textually and visually. In: 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR). IEEE, Kyoto, pp. 274\u2013280 (2017)","DOI":"10.1109\/ICDAR.2017.53"},{"key":"354_CR40","doi-asserted-by":"crossref","unstructured":"Sankar, K.P., Jawahar, C.V., Manmatha, R.: Nearest neighbor based collection OCR. In: Proceedings of the 8th IAPR International Workshop on Document Analysis Systems - DAS \u201910. ACM Press, Boston, Massachusetts, pp. 207\u2013214 (2010)","DOI":"10.1145\/1815330.1815357"},{"key":"354_CR41","doi-asserted-by":"crossref","unstructured":"Williams, K., Giles, CL.: Near duplicate detection in an academic digital library. In: Proceedings of the 2013 ACM symposium on Document engineering. Association for Computing Machinery, Florence, Italy, pp. 91\u201394 (2013)","DOI":"10.1145\/2494266.2494312"},{"key":"354_CR42","doi-asserted-by":"crossref","unstructured":"Schmidt, B.: Stable random projection: lightweight, general-purpose dimensionality reduction for digitized libraries. J Cult Anal. (2018)","DOI":"10.31235\/osf.io\/36neu"},{"key":"354_CR43","doi-asserted-by":"publisher","first-page":"1624","DOI":"10.1002\/asi.23482","volume":"67","author":"A Gonzalez-Agirre","year":"2016","unstructured":"Gonzalez-Agirre, A., Rigau, G., Agirre, E., et al.: Why are these similar? Investigating item similarity types in a large digital library. J Assoc Inf Sci Technol 67, 1624\u20131638 (2016)","journal-title":"J Assoc Inf Sci Technol"},{"key":"354_CR44","unstructured":"Organisciak, P., Capitanu, B., Underwood, T., Downie JS.: Access to billions of pages for large-scale text analysis (2017)"},{"key":"354_CR45","unstructured":"Lui, M., Baldwin, T.: langid.py: an off-the-shelf language identification tool. In: Proceedings of the ACL 2012 System Demonstrations. Association for Computational Linguistics, Jeju Island, Korea, pp. 25\u201330 (2012)"},{"key":"354_CR46","doi-asserted-by":"crossref","unstructured":"Organisciak, P., Shetenhelm, S., Vasques, D.F.A., Matusiak, K.: Characterizing Same Work Relationships in Large-Scale Digital Libraries. In: International Conference on Information. Springer, pp. 419\u2013425 (2019)","DOI":"10.1007\/978-3-030-15742-5_40"},{"key":"354_CR47","doi-asserted-by":"crossref","unstructured":"IFLA Study Group on the Functional Requirements for Bibliographic Records.: Functional Requirements for Bibliographic Records. IFLA, Munich (1998)","DOI":"10.1515\/9783110962451"},{"key":"354_CR48","unstructured":"JSC for Development of RDA.: RDA: Resource description and access: 2013 Revision (2013)"},{"key":"354_CR49","doi-asserted-by":"crossref","unstructured":"Das, A.S., Datar, M., Garg, A., Rajaram, S.: Google news personalization: scalable online collaborative filtering. In: Proceedings of the 16th International Conference on World Wide Web. Association for Computing Machinery, Banff, Alberta, Canada, pp. 271\u2013280 (2007)","DOI":"10.1145\/1242572.1242610"},{"key":"354_CR50","doi-asserted-by":"crossref","unstructured":"Liu, J., Jin, T., Pan, K., et al.: An improved KNN text classification algorithm based on Simhash. In: 2017 IEEE 16th International Conference on Cognitive Informatics Cognitive Computing (ICCI*CC). pp. 92\u201395 (2017)","DOI":"10.1109\/ICCI-CC.2017.8109735"},{"key":"354_CR51","doi-asserted-by":"publisher","first-page":"391","DOI":"10.1002\/(SICI)1097-4571(199009)41:6<391::AID-ASI1>3.0.CO;2-9","volume":"41","author":"S Deerwester","year":"1990","unstructured":"Deerwester, S., Dumais, S.T., Furnas, G.W., et al.: Indexing by latent semantic analysis. J Am Soc Inf Sci 41, 391\u2013407 (1990). https:\/\/doi.org\/10.1002\/(SICI)1097-4571(199009)41:6%3c391::AID-ASI1%3e3.0.CO;2-9","journal-title":"J Am Soc Inf Sci"},{"key":"354_CR52","doi-asserted-by":"publisher","unstructured":"Peters, M. E., Neumann, M., Iyyer, M., Gardner, M., Clark, C., Lee, K., & Zettlemoyer, L.: Deep Contextualized Word Representations. Proceedings of the 2018 Conference of the NorthAmerican Chapter of the Associationfor Computational Linguistics (ACL). pp. 2227\u20132237. (2018). https:\/\/doi.org\/10.18653\/v1\/N18-1202.","DOI":"10.18653\/v1\/N18-1202"},{"key":"354_CR53","doi-asserted-by":"crossref","unstructured":"Ethayarajh, K.: How Contextual are Contextualized Word Representations? Comparing the Geometry of BERT, ELMo, and GPT-2 Embeddings. (2019). ArXiv190900512 Cs","DOI":"10.18653\/v1\/D19-1006"},{"key":"354_CR54","doi-asserted-by":"crossref","unstructured":"Li, B., Zhou, H., He, J., et al.: On the sentence embeddings from BERT for semantic textual similarity. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP). pp. 9119\u20139130 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.733"},{"key":"354_CR55","doi-asserted-by":"publisher","first-page":"129","DOI":"10.1002\/asi.4630270302","volume":"27","author":"SE Robertson","year":"1976","unstructured":"Robertson, S.E., Sp\u00e4rck Jones, K.: Relevance weighting of search terms. J Am Soc Inf Sci 27, 129\u2013146 (1976). https:\/\/doi.org\/10.1002\/asi.4630270302","journal-title":"J Am Soc Inf Sci"},{"key":"354_CR56","doi-asserted-by":"publisher","first-page":"613","DOI":"10.1145\/361219.361220","volume":"18","author":"G Salton","year":"1975","unstructured":"Salton, G., Wong, A., Yang, C.S.: A vector space model for automatic indexing. Commun ACM 18, 613\u2013620 (1975). https:\/\/doi.org\/10.1145\/361219.361220","journal-title":"Commun ACM"},{"key":"354_CR57","unstructured":"Bia\u0142ecki, A., Muir, R., Ingersoll, G.: Apache lucene 4. In: Proceedings of the SIGIR 2012 Workshop on Open Source Information Retrieval. Portland, Oregon, p. 17 (2012)"},{"key":"354_CR58","unstructured":"Banon,\nS. (2022). Elasticsearch. Elastic. https:\/\/www.elastic.co\/elasticsearch"},{"key":"354_CR59","doi-asserted-by":"publisher","first-page":"135","DOI":"10.1162\/tacl_a_00051","volume":"5","author":"P Bojanowski","year":"2017","unstructured":"Bojanowski, P., Grave, E., Joulin, A., Mikolov, T.: Enriching word vectors with subword information. Trans Assoc Comput Linguist 5, 135\u2013146 (2017)","journal-title":"Trans Assoc Comput Linguist"},{"key":"354_CR60","unstructured":"Heinzerling, B., Strube, M.: BPEmb: Tokenization-free Pre-trained Subword Embeddings in 275 Languages. In: chair) NC (Conference, Choukri K, Cieri C, et al (eds) Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018). European Language Resources Association (ELRA), Miyazaki, Japan (2018)"}],"container-title":["International Journal on Digital Libraries"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00799-023-00354-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00799-023-00354-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00799-023-00354-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,8]],"date-time":"2023-12-08T11:05:18Z","timestamp":1702033518000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00799-023-00354-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,4,15]]},"references-count":60,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2023,12]]}},"alternative-id":["354"],"URL":"https:\/\/doi.org\/10.1007\/s00799-023-00354-5","relation":{},"ISSN":["1432-5012","1432-1300"],"issn-type":[{"value":"1432-5012","type":"print"},{"value":"1432-1300","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,4,15]]},"assertion":[{"value":"4 May 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 February 2023","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 March 2023","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 April 2023","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}