{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,11]],"date-time":"2026-05-11T22:53:56Z","timestamp":1778540036434,"version":"3.51.4"},"reference-count":44,"publisher":"Association for Computing Machinery (ACM)","issue":"11","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":["Proc. VLDB Endow."],"published-print":{"date-parts":[[2024,7]]},"abstract":"<jats:p>\n            Data dependency mining plays a crucial role in understanding data relationships. To address the increasing complexities of real-world data, Approximate Functional Dependencies (AFDs) have been introduced, building upon traditional FD. However, existing AFD approaches use static relaxation coefficients, limiting their effectiveness in capturing dependencies in noisy data. We propose a dynamic AFD variant, DAFD, which incorporates attribute error rates. We establish a bijection between DAFD and FD, develop its inference system, and introduce DAFDiscover, an algorithm for mining dependencies directly on noisy data. DAFDiscover matches the time and space complexity of SOTA AFD mining methods while offering superior performance. We theoretically prove its correctness, provide a method for calculating DAFD probabilities (DAFD-\n            <jats:italic>prob<\/jats:italic>\n            ), and derive a lower bound for DAFD's validity on dirty data. Experimental results on multiple public datasets demonstrate the semantic superiority of DAFD and the effectiveness of DAFDiscover compared to existing SOTA AFD mining techniques.\n          <\/jats:p>","DOI":"10.14778\/3681954.3682015","type":"journal-article","created":{"date-parts":[[2024,8,30]],"date-time":"2024-08-30T16:23:36Z","timestamp":1725035016000},"page":"3484-3496","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["DAFDiscover: Robust Mining Algorithm for Dynamic Approximate Functional Dependencies on Dirty Data"],"prefix":"10.14778","volume":"17","author":[{"given":"Xiaoou","family":"Ding","sequence":"first","affiliation":[{"name":"Harbin Institute of Technology"}]},{"given":"Yixing","family":"Lu","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology"}]},{"given":"Hongzhi","family":"Wang","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology"}]},{"given":"Chen","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University, China"}]},{"given":"Yida","family":"Liu","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology"}]},{"given":"Jianmin","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University, China"}]}],"member":"320","published-online":{"date-parts":[[2024,8,30]]},"reference":[{"key":"e_1_2_1_1_1","doi-asserted-by":"publisher","unstructured":"2020. BitcoinHeistRansomwareAddressDataset. UCI Machine Learning Repository. 10.24432\/C5BG8V","DOI":"10.24432\/C5BG8V"},{"key":"e_1_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/2661829.2661884"},{"key":"e_1_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.14778\/2850578.2850579"},{"key":"e_1_2_1_4_1","doi-asserted-by":"publisher","unstructured":"Michael Bain and Arthur Hoff. 1994. Chess (King-Rook vs. King). UCI Machine Learning Repository. 10.24432\/C57W2S","DOI":"10.24432\/C57W2S"},{"key":"e_1_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/2983323.2983781"},{"key":"e_1_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2020.2967722"},{"key":"e_1_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2015.2472010"},{"key":"e_1_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10618-019-00667-7"},{"key":"e_1_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.14778\/1453856.1453980"},{"key":"e_1_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.24432\/C5660T"},{"key":"e_1_2_1_11_1","unstructured":"codocedo. 2018. tane. https:\/\/github.com\/codocedo\/tane."},{"key":"e_1_2_1_12_1","doi-asserted-by":"publisher","unstructured":"Paulo Cortez and Anbal Morais. 2008. Forest Fires. UCI Machine Learning Repository. 10.24432\/C5D88D","DOI":"10.24432\/C5D88D"},{"key":"e_1_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE60146.2024.00282"},{"key":"e_1_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE60146.2024.00271"},{"key":"e_1_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.23919\/ICN.2022.0026"},{"key":"e_1_2_1_16_1","volume-title":"Foundations of Data Quality Management","author":"Fan Wenfei","unstructured":"Wenfei Fan and Floris Geerts. 2012. Foundations of Data Quality Management. Morgan & Claypool Publishers."},{"key":"e_1_2_1_17_1","doi-asserted-by":"publisher","unstructured":"Hadi Fanaee-T. 2013. Bike Sharing Dataset. UCI Machine Learning Repository. 10.24432\/C5W894","DOI":"10.24432\/C5W894"},{"key":"e_1_2_1_18_1","first-page":"139","article-title":"Database Dependency Discovery","volume":"12","author":"Flach Peter A.","year":"1999","unstructured":"Peter A. Flach and Iztok Savnik. 1999. Database Dependency Discovery: A Machine Learning Approach. AI Commun. 12, 3 (1999), 139--160. http:\/\/content.iospress.com\/articles\/ai-communications\/aic182","journal-title":"A Machine Learning Approach. AI Commun."},{"key":"e_1_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1093\/COMJNL\/42.2.100"},{"key":"e_1_2_1_20_1","volume-title":"Ilyas and Xu Chu","author":"Ihab","year":"2019","unstructured":"Ihab F. Ilyas and Xu Chu. 2019. Data Cleaning. ACM."},{"key":"e_1_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/1007568.1007641"},{"key":"e_1_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jnca.2016.08.002"},{"key":"e_1_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.14778\/3192965.3192968"},{"key":"e_1_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE55515.2023.00220"},{"key":"e_1_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2010.197"},{"key":"e_1_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-46439-5_24"},{"key":"e_1_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3097983.3098062"},{"key":"e_1_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM.2018.00047"},{"key":"e_1_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1023\/A:1009748302351"},{"key":"e_1_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.24432\/C5BS66"},{"key":"e_1_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.24432\/C55C7W"},{"key":"e_1_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-44503-X_13"},{"key":"e_1_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/2882903.2915203"},{"key":"e_1_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403178"},{"key":"e_1_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2020.3046443"},{"key":"e_1_2_1_36_1","volume-title":"IoT Data Quality. In the 29th ACM International Conference on Information and Knowledge Management CIKM. 3517--3518","author":"Song Shaoxu","year":"2020","unstructured":"Shaoxu Song and Aoqian Zhang. 2020. IoT Data Quality. In the 29th ACM International Conference on Information and Knowledge Management CIKM. 3517--3518."},{"key":"e_1_2_1_37_1","doi-asserted-by":"publisher","unstructured":"Saverio Vito. 2016. Air Quality. UCI Machine Learning Repository. 10.24432\/C59K5F","DOI":"10.24432\/C59K5F"},{"key":"e_1_2_1_38_1","volume-title":"Functional Dependency Generation and Applications in Pay-As-You-Go Data Integration Systems. In 12th International Workshop on the Web and Databases, WebDB 2009","author":"Wang Daisy Zhe","year":"2009","unstructured":"Daisy Zhe Wang, Xin Luna Dong, Anish Das Sarma, Michael J. Franklin, and Alon Y. Halevy. 2009. Functional Dependency Generation and Applications in Pay-As-You-Go Data Integration Systems. In 12th International Workshop on the Web and Databases, WebDB 2009, Providence, Rhode Island, USA, June 28, 2009. http:\/\/webdb09.cse.buffalo.edu\/papers\/Paper18\/webdb09.pdf"},{"key":"e_1_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2962152"},{"key":"e_1_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE.2019.00137"},{"key":"e_1_2_1_41_1","doi-asserted-by":"publisher","unstructured":"WIlliam Wolberg. 1992. Breast Cancer Wisconsin (Original). UCI Machine Learning Repository. 10.24432\/C5HP4Z","DOI":"10.24432\/C5HP4Z"},{"key":"e_1_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-44801-2_11"},{"key":"e_1_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM.2002.1184040"},{"key":"e_1_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3318464.3389749"}],"container-title":["Proceedings of the VLDB Endowment"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.14778\/3681954.3682015","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,4]],"date-time":"2024-09-04T18:31:30Z","timestamp":1725474690000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.14778\/3681954.3682015"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7]]},"references-count":44,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2024,7]]}},"alternative-id":["10.14778\/3681954.3682015"],"URL":"https:\/\/doi.org\/10.14778\/3681954.3682015","relation":{},"ISSN":["2150-8097"],"issn-type":[{"value":"2150-8097","type":"print"}],"subject":[],"published":{"date-parts":[[2024,7]]},"assertion":[{"value":"2024-08-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}