{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T08:36:20Z","timestamp":1743064580098,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":49,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819783663"},{"type":"electronic","value":"9789819783670"}],"license":[{"start":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T00:00:00Z","timestamp":1732838400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T00:00:00Z","timestamp":1732838400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-97-8367-0_23","type":"book-chapter","created":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T11:55:19Z","timestamp":1732794919000},"page":"387-401","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["DLUE: Benchmarking Document Language 
Understanding"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8145-6453","authenticated-orcid":false,"given":"Ruoxi","family":"Xu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5857-9663","authenticated-orcid":false,"given":"Hongyu","family":"Lin","sequence":"additional","affiliation":[]},{"given":"Xinyan","family":"Guan","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0615-2569","authenticated-orcid":false,"given":"Yingfei","family":"Sun","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8750-6295","authenticated-orcid":false,"given":"Le","family":"Sun","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,29]]},"reference":[{"issue":"9","key":"23_CR1","doi-asserted-by":"publisher","first-page":"804","DOI":"10.1002\/(SICI)1097-4571(199709)48:9<804::AID-ASI5>3.0.CO;2-V","volume":"48","author":"MK Buckland","year":"1997","unstructured":"Buckland, M.K.: What is a document? J. Am. Soc. Inf. Sci. 48(9), 804\u2013809 (1997)","journal-title":"J. Am. Soc. Inf. Sci."},{"key":"23_CR2","doi-asserted-by":"crossref","unstructured":"Kiesel, J., et al.: Semeval-2019 task 4: hyperpartisan news detection. In: Proceedings of the 13th International Workshop on Semantic Evaluation, pp. 829\u2013839 (2019)","DOI":"10.18653\/v1\/S19-2145"},{"key":"23_CR3","doi-asserted-by":"publisher","unstructured":"Dasigi, P., Lo, K., Beltagy, I., Cohan, A., Smith, N.A., Gardner, M.: A dataset of information-seeking questions and answers anchored in research papers. In: Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 4599\u20134610. Association for Computational Linguistics, Online (2021). 
https:\/\/doi.org\/10.18653\/v1\/2021.naacl-main.365, https:\/\/aclanthology.org\/2021.naacl-main.365","DOI":"10.18653\/v1\/2021.naacl-main.365"},{"key":"23_CR4","doi-asserted-by":"publisher","unstructured":"Huang, L., Cao, S., Parulian, N., Ji, H., Wang, L.: Efficient attentions for long document summarization. In: Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 1419\u20131436. Association for Computational Linguistics, Online (2021). https:\/\/doi.org\/10.18653\/v1\/2021.naacl-main.112, https:\/\/aclanthology.org\/2021.naacl-main.112","DOI":"10.18653\/v1\/2021.naacl-main.112"},{"key":"23_CR5","doi-asserted-by":"publisher","first-page":"317","DOI":"10.1162\/tacl_a_00023","volume":"6","author":"T Ko\u010disk\u00fd","year":"2018","unstructured":"Ko\u010disk\u00fd, T., et al.: The narrativeQA reading comprehension challenge. Trans. Assoc. Comput. Linguist. 6, 317\u2013328 (2018)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"23_CR6","unstructured":"Shanahan, T., Fisher, D., Frey, N.: The challenge of challenging text. On developing readers: readings from educational leadership (EL Essentials) 100 (2016)"},{"key":"23_CR7","unstructured":"Wang, A., et al.: SuperGLUE: a stickier benchmark for general-purpose language understanding systems. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"issue":"1","key":"23_CR8","doi-asserted-by":"publisher","first-page":"201","DOI":"10.1162\/COLI_a_00278","volume":"43","author":"F Benamara","year":"2017","unstructured":"Benamara, F., Taboada, M., Mathieu, Y.: Evaluative language beyond bags of words: linguistic insights and computational applications. Comput. Linguist. 43(1), 201\u2013264 (2017)","journal-title":"Comput. 
Linguist."},{"key":"23_CR9","unstructured":"Parsing, C.: Speech and language processing (2009)"},{"key":"23_CR10","doi-asserted-by":"crossref","unstructured":"Huang, K.H., Tang, S., Peng, N.: Document-level entity-based extraction as template generation. arXiv preprint arXiv:2109.04901 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.426"},{"key":"23_CR11","unstructured":"Tay, Y., et al.: Long range arena: a benchmark for efficient transformers. In: International Conference on Learning Representations (2021). https:\/\/openreview.net\/forum?id=qVyeW-grC2k"},{"key":"23_CR12","unstructured":"Hudson, G.T., Moubayed, N.A.: MuLD: the multitask long document benchmark. arXiv preprint arXiv:2202.07362 (2022)"},{"key":"23_CR13","doi-asserted-by":"crossref","unstructured":"Shaham, U., et\u00a0al.: SCROLLs: standardized comparison over long language sequences. arXiv preprint arXiv:2201.03533 (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.823"},{"key":"23_CR14","doi-asserted-by":"crossref","unstructured":"Shaham, U., Ivgi, M., Efrat, A., Berant, J., Levy, O.: ZeroSCROLLs: a zero-shot benchmark for long text understanding. In: Findings of the Association for Computational Linguistics: EMNLP 2023, pp. 7977\u20137989 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.536"},{"key":"23_CR15","unstructured":"Zhang, X., Zhao, J., LeCun, Y.: Character-level convolutional networks for text classification. In: Advances in Neural Information Processing Systems, vol. 28 (2015)"},{"issue":"3","key":"23_CR16","doi-asserted-by":"publisher","first-page":"581","DOI":"10.1007\/s10579-016-9343-x","volume":"51","author":"A Zeldes","year":"2017","unstructured":"Zeldes, A.: The gum corpus: creating multilayer resources in the classroom. Lang. Resour. Eval. 51(3), 581\u2013612 (2017). https:\/\/doi.org\/10.1007\/s10579-016-9343-x","journal-title":"Lang. Resour. 
Eval."},{"key":"23_CR17","doi-asserted-by":"crossref","unstructured":"Cheng, L., Bing, L., Yu, Q., Lu, W., Si, L.: APE: argument pair extraction from peer review and rebuttal via multi-task learning. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 7000\u20137011 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.569"},{"key":"23_CR18","unstructured":"Bamman, D., Lewke, O., Mansoor, A.: An annotated dataset of coreference in English literature. In: Proceedings of the Twelfth Language Resources and Evaluation Conference, pp. 44\u201354. European Language Resources Association, Marseille, France (2020). https:\/\/aclanthology.org\/2020.lrec-1.6"},{"key":"23_CR19","doi-asserted-by":"publisher","unstructured":"Chen, M., Chu, Z., Wiseman, S., Gimpel, K.: SummScreen: A dataset for abstractive screenplay summarization. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 8602\u20138615. Association for Computational Linguistics, Dublin, Ireland (2022). https:\/\/doi.org\/10.18653\/v1\/2022.acl-long.589","DOI":"10.18653\/v1\/2022.acl-long.589"},{"key":"23_CR20","unstructured":"Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R.R., Le, Q.V.: XLNet: generalized autoregressive pretraining for language understanding. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"23_CR21","unstructured":"Beltagy, I., Peters, M.E., Cohan, A.: Longformer: the long-document transformer. arXiv preprint arXiv:2004.05150 (2020)"},{"key":"23_CR22","first-page":"17283","volume":"33","author":"M Zaheer","year":"2020","unstructured":"Zaheer, M., et al.: Big Bird: transformers for longer sequences. Adv. Neural. Inf. Process. Syst. 33, 17283\u201317297 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"23_CR23","unstructured":"Child, R., Gray, S., Radford, A., Sutskever, I.: Generating long sequences with sparse transformers. 
arXiv preprint arXiv:1904.10509 (2019)"},{"key":"23_CR24","unstructured":"Wang, S., Li, B.Z., Khabsa, M., Fang, H., Ma, H.: Linformer: self-attention with linear complexity. arXiv preprint arXiv:2006.04768 (2020)"},{"key":"23_CR25","unstructured":"Choromanski, K., et\u00a0al.: Rethinking attention with performers. arXiv preprint arXiv:2009.14794 (2020)"},{"key":"23_CR26","unstructured":"Chiang, W.L., et\u00a0al.: Vicuna: An open-source chatbot impressing GPT-4 with 90%* ChatGPT quality. See https:\/\/vicuna.lmsys.org (2023). Accessed 14 Apr 2023"},{"key":"23_CR27","unstructured":"Touvron, H., et\u00a0al.: LLaMA 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"23_CR28","unstructured":"Jiang, A.Q., et al.: Mistral 7B (2023)"},{"key":"23_CR29","doi-asserted-by":"crossref","unstructured":"Du, Z., et al.: GLM: general language model pretraining with autoregressive blank infilling. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 320\u2013335 (2022)","DOI":"10.18653\/v1\/2022.acl-long.26"},{"key":"23_CR30","unstructured":"Conneau, A., Kiela, D.: SentEval: an evaluation toolkit for universal sentence representations. arXiv preprint arXiv:1803.05449 (2018)"},{"key":"23_CR31","unstructured":"McCann, B., Keskar, N.S., Xiong, C., Socher, R.: The natural language decathlon: multitask learning as question answering. arXiv preprint arXiv:1806.08730 (2018)"},{"key":"23_CR32","doi-asserted-by":"crossref","unstructured":"Wang, A., Singh, A., Michael, J., Hill, F., Levy, O., Bowman, S.R.: GLUE: a multi-task benchmark and analysis platform for natural language understanding. arXiv preprint arXiv:1804.07461 (2018)","DOI":"10.18653\/v1\/W18-5446"},{"key":"23_CR33","unstructured":"Mehri, S., Eric, M., Hakkani-Tur, D.: DialoGLUE: a natural language understanding benchmark for task-oriented dialogue. 
arXiv preprint arXiv:2009.13570 (2020)"},{"key":"23_CR34","doi-asserted-by":"crossref","unstructured":"Chen, M., Chu, Z., Gimpel, K.: Evaluation benchmarks and learning criteria for discourse-aware sentence representations. arXiv preprint arXiv:1909.00142 (2019)","DOI":"10.18653\/v1\/D19-1060"},{"key":"23_CR35","doi-asserted-by":"crossref","unstructured":"Khanuja, S., Dandapat, S., Srinivasan, A., Sitaram, S., Choudhury, M.: GLUECoS: an evaluation benchmark for code-switched NLP. arXiv preprint arXiv:2004.12376 (2020)","DOI":"10.18653\/v1\/2020.acl-main.329"},{"key":"23_CR36","doi-asserted-by":"crossref","unstructured":"Petroni, F., et\u00a0al.: KILT: a benchmark for knowledge intensive language tasks. arXiv preprint arXiv:2009.02252 (2020)","DOI":"10.18653\/v1\/2021.naacl-main.200"},{"key":"23_CR37","doi-asserted-by":"crossref","unstructured":"Cohan, A., et al.: A discourse-aware attention model for abstractive summarization of long documents. arXiv preprint arXiv:1804.05685 (2018)","DOI":"10.18653\/v1\/N18-2097"},{"key":"23_CR38","unstructured":"Tay, Y., et al.: Long range arena: a benchmark for efficient transformers. arXiv preprint arXiv:2011.04006 (2020)"},{"key":"23_CR39","doi-asserted-by":"publisher","first-page":"434","DOI":"10.1162\/tacl_a_00469","volume":"10","author":"J Guan","year":"2022","unstructured":"Guan, J., et al.: LOT: a story-centric benchmark for evaluating Chinese long text understanding and generation. Trans. Assoc. Comput. Linguist. 10, 434\u2013451 (2022)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"23_CR40","unstructured":"Kitaev, N., Kaiser, \u0141., Levskaya, A.: Reformer: the efficient transformer. arXiv preprint arXiv:2001.04451 (2020)"},{"key":"23_CR41","doi-asserted-by":"crossref","unstructured":"Koreeda, Y., Manning, C.D.: ContractNLI: a dataset for document-level natural language inference for contracts. 
arXiv preprint arXiv:2110.01799 (2021)","DOI":"10.18653\/v1\/2021.findings-emnlp.164"},{"key":"23_CR42","doi-asserted-by":"crossref","unstructured":"Xu, R., et al.: ECO v1: towards event-centric opinion mining. arXiv preprint arXiv:2203.12264 (2022)","DOI":"10.18653\/v1\/2022.findings-acl.216"},{"key":"23_CR43","doi-asserted-by":"publisher","unstructured":"Cohan, A., et al.: A discourse-aware attention model for abstractive summarization of long documents. In: Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers), pp. 615\u2013621. Association for Computational Linguistics, New Orleans, Louisiana (2018). https:\/\/doi.org\/10.18653\/v1\/N18-2097, https:\/\/aclanthology.org\/N18-2097","DOI":"10.18653\/v1\/N18-2097"},{"key":"23_CR44","unstructured":"Tay, Y., Dehghani, M., Bahri, D., Metzler, D.: Efficient transformers: a survey. ACM Comput. Surv. (CSUR) (2020)"},{"key":"23_CR45","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 4171\u20134186. Association for Computational Linguistics, Minneapolis, Minnesota (2019). https:\/\/doi.org\/10.18653\/v1\/N19-1423, https:\/\/aclanthology.org\/N19-1423","DOI":"10.18653\/v1\/N19-1423"},{"key":"23_CR46","doi-asserted-by":"crossref","unstructured":"Hu, M., Peng, Y., Huang, Z., Li, D.: A multi-type multi-span network for reading comprehension that requires discrete reasoning. arXiv preprint arXiv:1908.05514 (2019)","DOI":"10.18653\/v1\/D19-1170"},{"key":"23_CR47","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 
30 (2017)"},{"key":"23_CR48","first-page":"12792","volume":"33","author":"M Ding","year":"2020","unstructured":"Ding, M., Zhou, C., Yang, H., Tang, J.: CogLTX: applying BERT to long texts. Adv. Neural. Inf. Process. Syst. 33, 12792\u201312804 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"23_CR49","doi-asserted-by":"crossref","unstructured":"Pappagari, R., Zelasko, P., Villalba, J., Carmiel, Y., Dehak, N.: Hierarchical transformers for long document classification. In: 2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), pp. 838\u2013844. IEEE (2019)","DOI":"10.1109\/ASRU46091.2019.9003958"}],"container-title":["Lecture Notes in Computer Science","Chinese Computational Linguistics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-8367-0_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T12:08:01Z","timestamp":1732795681000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-8367-0_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,29]]},"ISBN":["9789819783663","9789819783670"],"references-count":49,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-8367-0_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,29]]},"assertion":[{"value":"29 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"In consideration of ethical concerns, we provide the following detailed description:1. 
We believe that this work is beneficial to develop universal document understanding architectures, which can help people quickly get information from business documents, legal statements and so on, saving time and money. 2. We standardize and put together ten datasets, which are all already publicly available under CC-BY-(NC-)SA-4.0 licenses. For all the datasets, we have referenced the original work and encouraged DLUE users to do so. 3. All DLUE benchmark datasets have low ethical risks and do not expose any sensitive or personally identifiable information.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics Statement"}},{"value":"CCL","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China National Conference on Chinese Computational Linguistics","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Taiyuan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25 July 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 July 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"cncl2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/cips-cl.org\/static\/CCL2024\/en\/index.html","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}