{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T21:58:41Z","timestamp":1757627921750,"version":"3.44.0"},"publisher-location":"Cham","reference-count":32,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783032020871"},{"type":"electronic","value":"9783032020888"}],"license":[{"start":{"date-parts":[[2025,8,18]],"date-time":"2025-08-18T00:00:00Z","timestamp":1755475200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,8,18]],"date-time":"2025-08-18T00:00:00Z","timestamp":1755475200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-02088-8_17","type":"book-chapter","created":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T16:13:44Z","timestamp":1755965624000},"page":"237-253","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Analytics Modelling over\u00a0Multiple Datasets Using Vector Embeddings"],"prefix":"10.1007","author":[{"given":"Andreas","family":"Loizou","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dimitrios","family":"Tsoumakos","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,8,18]]},"reference":[{"key":"17_CR1","unstructured":"Stock Market Dataset. Kaggle (2020). https:\/\/www.kaggle.com\/datasets\/jacksoncrow\/stock-market-dataset\/data"},{"key":"17_CR2","unstructured":"Weather Dataset. Kaggle (2020). https:\/\/www.kaggle.com\/datasets\/selfishgene\/historical-hourly-weather-data"},{"key":"17_CR3","doi-asserted-by":"crossref","unstructured":"Abdelaal, M., Yayak, A.B., Klede, K., Sch\u00f6ning, H.: Reclean: reinforcement learning for automated data cleaning in ml pipelines. In: 2024 IEEE 40th International Conference on Data Engineering Workshops (ICDEW), pp. 324\u2013330. IEEE (2024)","DOI":"10.1109\/ICDEW61823.2024.00048"},{"key":"17_CR4","doi-asserted-by":"crossref","unstructured":"Bakogiannis, T., Giannakopoulos, I., Tsoumakos, D., Koziris, N.: Apollo: a dataset profiling and operator modeling system. In: Proceedings of the 2019 International Conference on Management of Data, pp. 1869\u20131872 (2019)","DOI":"10.1145\/3299869.3320220"},{"key":"17_CR5","doi-asserted-by":"publisher","unstructured":"Becker, B., Kohavi, R.: Adult. UCI Machine Learning Repository (1996). https:\/\/doi.org\/10.24432\/C5XW20","DOI":"10.24432\/C5XW20"},{"key":"17_CR6","doi-asserted-by":"publisher","first-page":"2","DOI":"10.5334\/dsj-2015-002","volume":"14","author":"L Cai","year":"2015","unstructured":"Cai, L., Zhu, Y.: The challenges of data quality and data quality assessment in the big data era. Data Sci. J. 14, 2\u20132 (2015)","journal-title":"Data Sci. J."},{"key":"17_CR7","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2021.115053","volume":"180","author":"LV Dias","year":"2021","unstructured":"Dias, L.V., Miranda, P.B., Nascimento, A.C., Cordeiro, F.R., Mello, R.F., Prud\u00eancio, R.B.: Imagedataset2vec: an image dataset embedding for algorithm selection. Expert Syst. Appl. 180, 115053 (2021)","journal-title":"Expert Syst. Appl."},{"key":"17_CR8","unstructured":"Ghorbani, A., Zou, J.: Data shapley: equitable valuation of data for machine learning. In: International Conference on Machine Learning, pp. 2242\u20132251. PMLR (2019)"},{"key":"17_CR9","doi-asserted-by":"crossref","unstructured":"Giannakopoulos, I., Tsoumakos, D., Koziris, N.: A content-based approach for modeling analytics operators. In: Proceedings of the 27th ACM International Conference on Information and Knowledge Management, pp. 227\u2013236 (2018)","DOI":"10.1145\/3269206.3271731"},{"key":"17_CR10","first-page":"18932","volume":"34","author":"Y Gorishniy","year":"2021","unstructured":"Gorishniy, Y., Rubachev, I., Khrulkov, V., Babenko, A.: Revisiting deep learning models for tabular data. Adv. Neural. Inf. Process. Syst. 34, 18932\u201318943 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"17_CR11","unstructured":"Gupta, N., et\u00a0al.: Data quality toolkit: automatic assessment of data quality and remediation for machine learning datasets. arXiv preprint arXiv:2108.05935 (2021)"},{"key":"17_CR12","doi-asserted-by":"publisher","first-page":"72","DOI":"10.1016\/j.ijpe.2014.04.018","volume":"154","author":"BT Hazen","year":"2014","unstructured":"Hazen, B.T., Boone, C.A., Ezell, J.D., Jones-Farmer, L.A.: Data quality for data science, predictive analytics, and big data in supply chain management: an introduction to the problem and suggestions for research and applications. Int. J. Prod. Econ. 154, 72\u201380 (2014)","journal-title":"Int. J. Prod. Econ."},{"key":"17_CR13","doi-asserted-by":"publisher","unstructured":"Hebrail, G., Berard, A.: Individual Household Electric Power Consumption. UCI Machine Learning Repository (2006). https:\/\/doi.org\/10.24432\/C58K54","DOI":"10.24432\/C58K54"},{"key":"17_CR14","unstructured":"Huang, X., Khetan, A., Cvitkovic, M., Karnin, Z.: Tabtransformer: tabular data modeling using contextual embeddings. arXiv preprint arXiv:2012.06678 (2020)"},{"key":"17_CR15","doi-asserted-by":"crossref","unstructured":"Jakubik, J., V\u00f6ssing, M., K\u00fchl, N., Walk, J., Satzger, G.: Data-centric artificial intelligence. Bus. Inf. Syst. Eng. 1\u20139 (2024)","DOI":"10.1007\/s12599-024-00857-8"},{"issue":"3","key":"17_CR16","doi-asserted-by":"publisher","first-page":"964","DOI":"10.1007\/s10618-021-00737-9","volume":"35","author":"HS Jomaa","year":"2021","unstructured":"Jomaa, H.S., Schmidt-Thieme, L., Grabocka, J.: Dataset2vec: learning dataset meta-features. Data Min. Knowl. Disc. 35(3), 964\u2013985 (2021)","journal-title":"Data Min. Knowl. Disc."},{"key":"17_CR17","doi-asserted-by":"crossref","unstructured":"Kingma, D.P., Welling, M., et\u00a0al.: An introduction to variational autoencoders. Found. Trends\u00ae Mach. Learn. 12(4), 307\u2013392 (2019)","DOI":"10.1561\/2200000056"},{"issue":"1","key":"17_CR18","doi-asserted-by":"publisher","first-page":"79","DOI":"10.1214\/aoms\/1177729694","volume":"22","author":"S Kullback","year":"1951","unstructured":"Kullback, S., Leibler, R.A.: On information and sufficiency. Ann. Math. Stat. 22(1), 79\u201386 (1951)","journal-title":"Ann. Math. Stat."},{"issue":"2","key":"17_CR19","doi-asserted-by":"publisher","first-page":"129","DOI":"10.1109\/TIT.1982.1056489","volume":"28","author":"S Lloyd","year":"1982","unstructured":"Lloyd, S.: Least squares quantization in PCM. IEEE Trans. Inf. Theory 28(2), 129\u2013137 (1982)","journal-title":"IEEE Trans. Inf. Theory"},{"key":"17_CR20","doi-asserted-by":"crossref","unstructured":"Loizou, A., Tsoumakos, D.: Analytics modelling over multiple datasets using vector embeddings. arXiv preprint arXiv:2502.17060 (2025)","DOI":"10.1007\/978-3-032-02088-8_17"},{"key":"17_CR21","unstructured":"MacQueen, J., et\u00a0al.: Some methods for classification and analysis of multivariate observations. In: Proceedings of the Fifth Berkeley Symposium on Mathematical Statistics and Probability, Oakland, CA, USA, vol.\u00a01, pp. 281\u2013297 (1967)"},{"key":"17_CR22","unstructured":"Mikolov, T.: Efficient estimation of word representations in vector space. arXiv preprint arXiv:1301.37813781 (2013)"},{"key":"17_CR23","doi-asserted-by":"crossref","unstructured":"Murray, D.G., Simsa, J., Klimovic, A., Indyk, I.: tf. data: a machine learning data processing framework. arXiv preprint arXiv:2101.12127 (2021)","DOI":"10.14778\/3476311.3476374"},{"key":"17_CR24","unstructured":"Narayanan, A., Chandramohan, M., Venkatesan, R., Chen, L., Liu, Y., Jaiswal, S.: graph2vec: learning distributed representations of graphs. arXiv preprint arXiv:1707.05005 (2017)"},{"key":"17_CR25","doi-asserted-by":"crossref","unstructured":"Ni, W., Zhang, K., Miao, X., Zhao, X., Wu, Y., Yin, J.: Iterclean: an iterative data cleaning framework with large language models. In: Proceedings of the ACM Turing Award Celebration Conference-China 2024, pp. 100\u2013105 (2024)","DOI":"10.1145\/3674399.3674436"},{"key":"17_CR26","unstructured":"Rezig, E.K., et al.: Dagger: a data (not code) debugger. In: CIDR 2020, 10th Conference on Innovative Data Systems Research, Amsterdam, The Netherlands, January 12-15, 2020, Online Proceedings (2020)"},{"key":"17_CR27","doi-asserted-by":"publisher","first-page":"53","DOI":"10.1016\/0377-0427(87)90125-7","volume":"20","author":"PJ Rousseeuw","year":"1987","unstructured":"Rousseeuw, P.J.: Silhouettes: a graphical aid to the interpretation and validation of cluster analysis. J. Comput. Appl. Math. 20, 53\u201365 (1987)","journal-title":"J. Comput. Appl. Math."},{"key":"17_CR28","first-page":"2902","volume":"35","author":"Z Wang","year":"2022","unstructured":"Wang, Z., Sun, J.: Transtab: learning transferable tabular transformers across tables. Adv. Neural. Inf. Process. Syst. 35, 2902\u20132915 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"17_CR29","unstructured":"Xiong, R., et al.: On layer normalization in the transformer architecture. In: International Conference on Machine Learning, pp. 10524\u201310533. PMLR (2020)"},{"issue":"5","key":"17_CR30","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3711118","volume":"57","author":"D Zha","year":"2025","unstructured":"Zha, D., et al.: Data-centric artificial intelligence: a survey. ACM Comput. Surv. 57(5), 1\u201342 (2025)","journal-title":"ACM Comput. Surv."},{"key":"17_CR31","doi-asserted-by":"crossref","unstructured":"Zhang, L., Zhang, S., Balog, K.: Table2vec: neural word and entity embeddings for table population and retrieval. In: Proceedings of the 42nd International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 1029\u20131032 (2019)","DOI":"10.1145\/3331184.3331333"},{"key":"17_CR32","doi-asserted-by":"publisher","unstructured":"Zhu, C., Zhang, Q., Cao, L., Abrahamyan, A.: Mix2vec: unsupervised mixed data representation. In: 2020 IEEE 7th International Conference on Data Science and Advanced Analytics (DSAA), pp. 118\u2013127 (2020). https:\/\/doi.org\/10.1109\/DSAA49011.2020.00024","DOI":"10.1109\/DSAA49011.2020.00024"}],"container-title":["Lecture Notes in Computer Science","Database and Expert Systems Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-02088-8_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,10]],"date-time":"2025-09-10T05:15:28Z","timestamp":1757481328000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-02088-8_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,18]]},"ISBN":["9783032020871","9783032020888"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-02088-8_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025,8,18]]},"assertion":[{"value":"18 August 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"DEXA","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Database and Expert Systems Applications","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Bangkok","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Thailand","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25 August 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 August 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"36","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"dexa2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.dexa.org\/2025\/dexa2025.html","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}