数据质量监控的一些思路和方案 最后更新时间:2024年02月04日 #### 基于系统本身 1. 可以基于系统本身的数据,收集大量的元数据,再组合成或sql或python或hive或spark的代码,批量的对元数据进行查询和计算,以达到监控数据元信息的目的。 #### 导出数据 1. 通过python读取excel文件,分析数据的格式是否符合模板下的格式,如果不符合则记录下来。 示例: ```python import pandas as pd from pandas import DataFrame def digit20(digit:str,allow_symbols:list): """digit20 将字符串中的数字批量转换成0 Keyword arguments: digit -- 字符串 allow_symbols -- 允许存在的符号 Return: str """ s = "" for v in str(digit): try: if v in allow_symbols: s += v continue else: v = int(v) s += "0" except: raise Exception(f"{v}不是合法标识符") return s def formatLanToformat(formatLan:str|list): """formatLanToformat 将format模板语法转换为模板 symbols是用于存放关键字的位置,用于放置一些语法的特殊标识符或特殊语法 如: | -- 竖线符号表示了或,当语法中出现该符号`|`时,则字符可以既可以匹配符号前一个字符,也可以匹配竖线后一个字符。 \ -- 斜线符号表示了转义,当语法中出现了该符号`\`时,则符号下一个字符直接作为字符放入,不认定为关键字。 Keyword arguments: formatLan -- format模板语法或format模板语法列表 Return: format -- list format模板 """ symbols = ["|"] format_s_s = [] def analysisSymbol(formatL,now_l=""): format_s = [] num = 0 format_s.extend(list(now_l)) formatL = list(formatL) for v in formatL: if v not in symbols: format_s.append(v) elif v == "|": analysisSymbol(formatL[(num+2)::],format_s) format_s = format_s[0:len(format_s)-1] analysisSymbol(formatL[(num+1)::],format_s) break # elif v == "\\": # 该方法目前废弃,疑似与python语法有所冲突 # format_s.append(formatL[num+1]) # analysisSymbol(formatL[(num+2)::],format_s) # break num += 1 else: format_s = "".join(format_s) format_s_s.append(format_s) if type(formatLan) == str: data = analysisSymbol(formatLan) if type(formatLan) == list: for format_v in formatLan: data = analysisSymbol(format_v) format_s_s = list(set(format_s_s)) return format_s_s def analysis(df:DataFrame,format_list:list,allow_symbols:list,field_querys:str|list) -> list: """analysis 用于分析找到不符合规定的字段的信息 Keyword arguments: df -- pandas的datafram format_list -- 模板列表 allow_symbols -- 模板关键字 field_query -- 字段筛选条件 Return: return_description """ format_list_chace = [] list(map(format_list_chace.extend,list(map(formatLanToformat,format_list)))) print(format_list_chace) illegal_dfs = [] s = 0 for num in range(0,len(df)): s+=1 table_name = df["table_name"][num] table_comment = df["table_comment"][num] field = df["filed"][num] value = df["field_value"][num] filed_comment = df["filed_comment"][num] if type(field_querys) == str: if field_query not in str(field) or field_query not in str(filed_comment): continue else: try: digit = digit20(value,allow_symbols) if digit in format_list_chace: illegal_dfs.append({"table_name":table_name,"table_comment":table_comment,"filed":field,"value":value,"filed_comment":filed_comment}) except Exception: illegal_dfs.append({"table_name":table_name,"table_comment":table_comment,"filed":field,"value":value,"filed_comment":filed_comment}) elif type(field_querys) == list: num = len(field_querys) js_num = 0 for field_query in field_querys: if field_query in str(field) or field_query in str(filed_comment): js_num = 1 if js_num == 1: try: digit = digit20(value,allow_symbols) if digit not in format_list_chace: illegal_dfs.append({"table_name":table_name,"table_comment":table_comment,"filed":field,"value":value,"filed_comment":filed_comment}) except Exception: illegal_dfs.append({"table_name":table_name,"table_comment":table_comment,"filed":field,"value":value,"filed_comment":filed_comment}) return illegal_dfs if __name__ == "__main__": phone_format = ["00000000000","000-00000"] phone_allow_sysbols = [] date_format = ["0000/|\\00/|\\00 00:00:00:00", "0000/|\\00/|\\00 00:00:00", "0000-00-00 00:00:00:00", "0000-00-00 00:00:00", "0000.00.00 00:00:00:00", "0000.00.00 00:00:00", "0000/|\\|年|-|.0月|", "0000/|\\|年|-|.00月|", "0000/|\\|年|-|.0/|\\|月|-|.0日|", "0000/|\\|年|-|.00/|\\|月|-|.0日|", "0000/|\\|年|-|.00/|\\|月|-|.00日|" ] date_allow_sysbols = ["/","\\","-",":","."," "] info_format = ["000000000000000000|x","000000000000000"] info_allow_sysbols = ["x","X"] df = pd.read_excel("./input/tmpyxxtbale.xlsx",sheet_name="Sheet1") # 联系电话","联系方式","电话","手机","号码 # datas = analysis(df,date_format,date_allow_sysbols,["时间","日期"]) datas = analysis(df,info_format,info_allow_sysbols,["号码","身份证","身份","证件号","号"]) print(datas) df = pd.DataFrame(datas) df.to_excel("./output/data.xlsx") ```
Comments | NOTHING