基于系统本身
- 可以基于系统本身的数据,收集大量的元数据,再组合成或sql或python或hive或spark的代码,批量的对元数据进行查询和计算,以达到监控数据元信息的目的。
导出数据
- 通过python读取excel文件,分析数据的格式是否符合模板下的格式,如果不符合则记录下来。
示例:
import pandas as pd
from pandas import DataFrame
def digit20(digit:str,allow_symbols:list):
"""digit20
将字符串中的数字批量转换成0
Keyword arguments:
digit -- 字符串
allow_symbols -- 允许存在的符号
Return: str
"""
s = ""
for v in str(digit):
try:
if v in allow_symbols:
s += v
continue
else:
v = int(v)
s += "0"
except:
raise Exception(f"{v}不是合法标识符")
return s
def formatLanToformat(formatLan:str|list):
"""formatLanToformat
将format模板语法转换为模板
symbols是用于存放关键字的位置,用于放置一些语法的特殊标识符或特殊语法
如:
| -- 竖线符号表示了或,当语法中出现该符号`|`时,则字符可以既可以匹配符号前一个字符,也可以匹配竖线后一个字符。
\ -- 斜线符号表示了转义,当语法中出现了该符号`\`时,则符号下一个字符直接作为字符放入,不认定为关键字。
Keyword arguments:
formatLan -- format模板语法或format模板语法列表
Return: format -- list format模板
"""
symbols = ["|"]
format_s_s = []
def analysisSymbol(formatL,now_l=""):
format_s = []
num = 0
format_s.extend(list(now_l))
formatL = list(formatL)
for v in formatL:
if v not in symbols:
format_s.append(v)
elif v == "|":
analysisSymbol(formatL[(num+2)::],format_s)
format_s = format_s[0:len(format_s)-1]
analysisSymbol(formatL[(num+1)::],format_s)
break
# elif v == "\\": # 该方法目前废弃,疑似与python语法有所冲突
# format_s.append(formatL[num+1])
# analysisSymbol(formatL[(num+2)::],format_s)
# break
num += 1
else:
format_s = "".join(format_s)
format_s_s.append(format_s)
if type(formatLan) == str:
data = analysisSymbol(formatLan)
if type(formatLan) == list:
for format_v in formatLan:
data = analysisSymbol(format_v)
format_s_s = list(set(format_s_s))
return format_s_s
def analysis(df:DataFrame,format_list:list,allow_symbols:list,field_querys:str|list) -> list:
"""analysis
用于分析找到不符合规定的字段的信息
Keyword arguments:
df -- pandas的datafram
format_list -- 模板列表
allow_symbols -- 模板关键字
field_query -- 字段筛选条件
Return: return_description
"""
format_list_chace = []
list(map(format_list_chace.extend,list(map(formatLanToformat,format_list))))
print(format_list_chace)
illegal_dfs = []
s = 0
for num in range(0,len(df)):
s+=1
table_name = df["table_name"][num]
table_comment = df["table_comment"][num]
field = df["filed"][num]
value = df["field_value"][num]
filed_comment = df["filed_comment"][num]
if type(field_querys) == str:
if field_query not in str(field) or field_query not in str(filed_comment):
continue
else:
try:
digit = digit20(value,allow_symbols)
if digit in format_list_chace:
illegal_dfs.append({"table_name":table_name,"table_comment":table_comment,"filed":field,"value":value,"filed_comment":filed_comment})
except Exception:
illegal_dfs.append({"table_name":table_name,"table_comment":table_comment,"filed":field,"value":value,"filed_comment":filed_comment})
elif type(field_querys) == list:
num = len(field_querys)
js_num = 0
for field_query in field_querys:
if field_query in str(field) or field_query in str(filed_comment):
js_num = 1
if js_num == 1:
try:
digit = digit20(value,allow_symbols)
if digit not in format_list_chace:
illegal_dfs.append({"table_name":table_name,"table_comment":table_comment,"filed":field,"value":value,"filed_comment":filed_comment})
except Exception:
illegal_dfs.append({"table_name":table_name,"table_comment":table_comment,"filed":field,"value":value,"filed_comment":filed_comment})
return illegal_dfs
if __name__ == "__main__":
phone_format = ["00000000000","000-00000"]
phone_allow_sysbols = []
date_format = ["0000/|\\00/|\\00 00:00:00:00",
"0000/|\\00/|\\00 00:00:00",
"0000-00-00 00:00:00:00",
"0000-00-00 00:00:00",
"0000.00.00 00:00:00:00",
"0000.00.00 00:00:00",
"0000/|\\|年|-|.0月|",
"0000/|\\|年|-|.00月|",
"0000/|\\|年|-|.0/|\\|月|-|.0日|",
"0000/|\\|年|-|.00/|\\|月|-|.0日|",
"0000/|\\|年|-|.00/|\\|月|-|.00日|"
]
date_allow_sysbols = ["/","\\","-",":","."," "]
info_format = ["000000000000000000|x","000000000000000"]
info_allow_sysbols = ["x","X"]
df = pd.read_excel("./input/tmpyxxtbale.xlsx",sheet_name="Sheet1")
# 联系电话","联系方式","电话","手机","号码
# datas = analysis(df,date_format,date_allow_sysbols,["时间","日期"])
datas = analysis(df,info_format,info_allow_sysbols,["号码","身份证","身份","证件号","号"])
print(datas)
df = pd.DataFrame(datas)
df.to_excel("./output/data.xlsx")
Comments | NOTHING