{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,27]],"date-time":"2026-05-27T14:37:57Z","timestamp":1779892677281,"version":"3.53.1"},"reference-count":26,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2024,3,18]],"date-time":"2024-03-18T00:00:00Z","timestamp":1710720000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,3,18]],"date-time":"2024-03-18T00:00:00Z","timestamp":1710720000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61977003"],"award-info":[{"award-number":["61977003"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"New Liberal Arts Research and Reform Practice Projects of the Ministry of Education of China","award":["2021180002"],"award-info":[{"award-number":["2021180002"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach Learn"],"published-print":{"date-parts":[[2024,7]]},"DOI":"10.1007\/s10994-023-06512-9","type":"journal-article","created":{"date-parts":[[2024,3,18]],"date-time":"2024-03-18T16:01:28Z","timestamp":1710777688000},"page":"3999-4012","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["TOCOL: improving contextual representation of pre-trained language models via token-level contrastive learning"],"prefix":"10.1007","volume":"113","author":[{"given":"Keheng","family":"Wang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0742-0804","authenticated-orcid":false,"given":"Chuantao","family":"Yin","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rumei","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Sirui","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yunsen","family":"Xian","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wenge","family":"Rong","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhang","family":"Xiong","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,3,18]]},"reference":[{"key":"6512_CR1","unstructured":"Bahdanau, D., Cho, K., & Bengio, Y. (2014). Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473."},{"key":"6512_CR2","unstructured":"Bentivogli, L., Clark, P., Dagan, I., & Giampiccolo, D. (2009). The fifth pascal recognizing textual entailment challenge. In: TAC."},{"key":"6512_CR3","doi-asserted-by":"publisher","unstructured":"Cer, D., Diab, M., Agirre, E., Lopez-Gazpio, I., & Specia, L. (2017). SemEval-2017 task 1: Semantic textual similarity multilingual and crosslingual focused evaluation. In: Proceedings of the 11th international workshop on semantic evaluation (SemEval-2017). pp. 1\u201314. Association for Computational Linguistics, Vancouver, Canada. https:\/\/doi.org\/10.18653\/v1\/S17-2001, https:\/\/aclanthology.org\/S17-2001","DOI":"10.18653\/v1\/S17-2001"},{"key":"6512_CR4","unstructured":"Chen, T., Kornblith, S., Norouzi, M., & Hinton, G. (2020). A simple framework for contrastive learning of visual representations. In: International conference on machine learning. pp. 1597\u20131607. PMLR."},{"key":"6512_CR5","unstructured":"Chen, Z., Zhang, H., Zhang, X., & Zhao, L. (2018). Quora question pairs."},{"key":"6512_CR6","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M.W., Lee, K., & Toutanova, K. (2019). BERT: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: Human language technologies, volume 1 (Long and Short Papers). pp. 4171\u20134186. Association for Computational Linguistics, Minneapolis, Minnesota. https:\/\/doi.org\/10.18653\/v1\/N19-1423, https:\/\/aclanthology.org\/N19-1423","DOI":"10.18653\/v1\/N19-1423"},{"key":"6512_CR7","unstructured":"Dolan, W.B., & Brockett, C. (2005). Automatically constructing a corpus of sentential paraphrases. In: Proceedings of the third international workshop on paraphrasing (IWP2005). https:\/\/aclanthology.org\/I05-5002"},{"key":"6512_CR8","doi-asserted-by":"publisher","unstructured":"Fu, Z., Zhou, W., Xu, J., Zhou, H., & Li, L. (2022). Contextual representation learning beyond masked language modeling. In: Proceedings of the 60th annual meeting of the association for computational linguistics (Volume 1: Long Papers). pp. 2701\u20132714. Association for Computational Linguistics, Dublin, Ireland. https:\/\/doi.org\/10.18653\/v1\/2022.acl-long.193, https:\/\/aclanthology.org\/2022.acl-long.193","DOI":"10.18653\/v1\/2022.acl-long.193"},{"key":"6512_CR9","unstructured":"Gao, J., He, D., Tan, X., Qin, T., Wang, L., & Liu, T. (2019). Representation degeneration problem in training natural language generation models. In: International conference on learning representations. https:\/\/openreview.net\/forum?id=SkEYojRqtm"},{"key":"6512_CR10","doi-asserted-by":"publisher","unstructured":"Gao, T., Yao, X., & Chen, D. (2021). SimCSE: Simple contrastive learning of sentence embeddings. In: Proceedings of the 2021 conference on empirical methods in natural language processing. pp. 6894\u20136910. Association for Computational Linguistics, Online and Punta Cana, Dominican Republic. https:\/\/doi.org\/10.18653\/v1\/2021.emnlp-main.552, https:\/\/aclanthology.org\/2021.emnlp-main.552","DOI":"10.18653\/v1\/2021.emnlp-main.552"},{"key":"6512_CR11","unstructured":"Gunel, B., Du, J., Conneau, A., & Stoyanov, V. (2021). Supervised contrastive learning for pre-trained language model fine-tuning. In: International conference on learning representations. https:\/\/openreview.net\/forum?id=cu7IUiOhujH"},{"key":"6512_CR12","unstructured":"Lan, Z., Chen, M., Goodman, S., Gimpel, K., Sharma, P., & Soricut, R. (2019). Albert: A lite bert for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942."},{"key":"6512_CR13","doi-asserted-by":"publisher","unstructured":"Li, B., Zhou, H., He, J., Wang, M., Yang, Y., & Li, L. (2020). On the sentence embeddings from pre-trained language models. In: Proceedings of the 2020 conference on empirical methods in natural language processing (EMNLP). pp. 9119\u20139130. Association for Computational Linguistics, Online. https:\/\/doi.org\/10.18653\/v1\/2020.emnlp-main.733, https:\/\/aclanthology.org\/2020.emnlp-main.733","DOI":"10.18653\/v1\/2020.emnlp-main.733"},{"key":"6512_CR14","doi-asserted-by":"publisher","unstructured":"Liu, F., Vuli\u0107, I., Korhonen, A., & Collier, N. (2021). Fast, effective, and self-supervised: Transforming masked language models into universal lexical and sentence encoders. In: Proceedings of the 2021 conference on empirical methods in natural language processing. pp. 1442\u20131459. Association for Computational Linguistics, Online and Punta Cana, Dominican Republic. https:\/\/doi.org\/10.18653\/v1\/2021.emnlp-main.109, https:\/\/aclanthology.org\/2021.emnlp-main.109","DOI":"10.18653\/v1\/2021.emnlp-main.109"},{"issue":"140","key":"6512_CR15","first-page":"1","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., Matena, M., Zhou, Y., Li, W., Liu, P. J., et al. (2020). Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of Machine Learning Research, 21(140), 1\u201367.","journal-title":"Journal of Machine Learning Research"},{"key":"6512_CR16","doi-asserted-by":"publisher","unstructured":"Rajpurkar, P., Zhang, J., Lopyrev, K., & Liang, P. (2016). SQuAD: 100,000+ questions for machine comprehension of text. In: Proceedings of the 2016 conference on empirical methods in natural language processing, pp. 2383\u20132392. Association for Computational Linguistics, Austin, Texas. https:\/\/doi.org\/10.18653\/v1\/D16-1264, https:\/\/aclanthology.org\/D16-1264","DOI":"10.18653\/v1\/D16-1264"},{"key":"6512_CR17","doi-asserted-by":"crossref","unstructured":"Socher, R., Perelygin, A., Wu, J., Chuang, J., Manning, C.D., Ng, A., & Potts, C. (2013). Recursive deep models for semantic compositionality over a sentiment treebank. In: Proceedings of the 2013 conference on empirical methods in natural language processing, pp. 1631\u20131642. Association for Computational Linguistics, Seattle, Washington, USA. https:\/\/aclanthology.org\/D13-1170","DOI":"10.18653\/v1\/D13-1170"},{"key":"6512_CR18","doi-asserted-by":"publisher","unstructured":"Su, Y., Liu, F., Meng, Z., Lan, T., Shu, L., Shareghi, E., & Collier, N. (2022). TaCL: Improving BERT pre-training with token-aware contrastive learning. In: Findings of the association for computational linguistics: NAACL 2022, pp. 2497\u20132507. Association for Computational Linguistics, Seattle, United States. https:\/\/doi.org\/10.18653\/v1\/2022.findings-naacl.191, https:\/\/aclanthology.org\/2022.findings-naacl.191","DOI":"10.18653\/v1\/2022.findings-naacl.191"},{"key":"6512_CR19","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, L.u., & Polosukhin, I. (2017). Attention is all you need. In: Guyon, I., Luxburg, U.V., Bengio, S., Wallach, H., Fergus, R., Vishwanathan, S., & Garnett, R. (eds.) Advances in neural information processing systems, vol.\u00a030. Curran Associates, Inc. (2017), https:\/\/proceedings.neurips.cc\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf."},{"key":"6512_CR20","doi-asserted-by":"publisher","unstructured":"Wang, A., Singh, A., Michael, J., Hill, F., Levy, O., & Bowman, S. (2018). GLUE: A multi-task benchmark and analysis platform for natural language understanding. In: Proceedings of the 2018 EMNLP workshop BlackboxNLP: Analyzing and interpreting neural networks for NLP, pp. 353\u2013355. Association for Computational Linguistics, Brussels, Belgium. https:\/\/doi.org\/10.18653\/v1\/W18-5446, https:\/\/aclanthology.org\/W18-5446","DOI":"10.18653\/v1\/W18-5446"},{"key":"6512_CR21","unstructured":"Wang, L., Huang, J., Huang, K., Hu, Z., Wang, G., & Gu, Q. (2020). Improving neural language generation with spectrum control. In: International conference on learning representations. https:\/\/openreview.net\/forum?id=ByxY8CNtvr"},{"key":"6512_CR22","doi-asserted-by":"publisher","first-page":"625","DOI":"10.1162\/tacl_a_00290","volume":"7","author":"A Warstadt","year":"2019","unstructured":"Warstadt, A., Singh, A., & Bowman, S. R. (2019). Neural network acceptability judgments. Transactions of the Association for Computational Linguistics, 7, 625\u2013641.","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"6512_CR23","doi-asserted-by":"publisher","unstructured":"Williams, A., Nangia, N., & Bowman, S. (2018). A broad-coverage challenge corpus for sentence understanding through inference. In: Proceedings of the 2018 conference of the North American chapter of the association for computational linguistics: Human language technologies, Volume 1 (Long Papers). pp. 1112\u20131122. Association for Computational Linguistics, New Orleans, Louisiana. https:\/\/doi.org\/10.18653\/v1\/N18-1101, https:\/\/aclanthology.org\/N18-1101","DOI":"10.18653\/v1\/N18-1101"},{"key":"6512_CR24","doi-asserted-by":"publisher","unstructured":"Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Rault, T., Louf, R., Funtowicz, M., Davison, J., Shleifer, S., von Platen, P., Ma, C., Jernite, Y., Plu, J., Xu, C., Le\u00a0Scao, T., Gugger, S., Drame, M., Lhoest, Q., & Rush, A. (2020) Transformers: State-of-the-art natural language processing. In: Proceedings of the 2020 conference on empirical methods in natural language processing: System demonstrations, pp. 38\u201345. Association for Computational Linguistics, Online. https:\/\/doi.org\/10.18653\/v1\/2020.emnlp-demos.6, https:\/\/aclanthology.org\/2020.emnlp-demos.6","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"6512_CR25","doi-asserted-by":"publisher","unstructured":"Yan, Y., Li, R., Wang, S., Zhang, F., Wu, W., & Xu, W. (2021). ConSERT: A contrastive framework for self-supervised sentence representation transfer. In: Proceedings of the 59th annual meeting of the association for computational linguistics and the 11th international joint conference on natural language processing (Volume 1: Long Papers), pp. 5065\u20135075. Association for Computational Linguistics, Online. https:\/\/doi.org\/10.18653\/v1\/2021.acl-long.393, https:\/\/aclanthology.org\/2021.acl-long.393","DOI":"10.18653\/v1\/2021.acl-long.393"},{"key":"6512_CR26","unstructured":"Zhuang, L., Wayne, L., Ya, S., & Jun, Z. (2021). A robustly optimized bert pre-training approach with post-training. In: Proceedings of the 20th Chinese national conference on computational linguistics, pp. 1218\u20131227."}],"container-title":["Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-023-06512-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10994-023-06512-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-023-06512-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,27]],"date-time":"2025-11-27T18:06:23Z","timestamp":1764266783000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10994-023-06512-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,3,18]]},"references-count":26,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2024,7]]}},"alternative-id":["6512"],"URL":"https:\/\/doi.org\/10.1007\/s10994-023-06512-9","relation":{},"ISSN":["0885-6125","1573-0565"],"issn-type":[{"value":"0885-6125","type":"print"},{"value":"1573-0565","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,3,18]]},"assertion":[{"value":"25 April 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 October 2023","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 December 2023","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 March 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no relevant financial or non-financial interests to disclose.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval"}},{"value":"All authors agree to participate.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent to participate"}},{"value":"All authors agree with the publication.","order":5,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}