您的位置:首页 > 编程语言 > Python开发

基于python写的专门用于字符串匹配的smartscript实现代码

2012-04-30 01:25 1091 查看
采用csv文件保存脚本,程序启动时就读取指定的csv文件,然后根据脚本对控制台输入的文本数据进行匹配,并输出结果或者执行python语句。

# -*- coding: utf-8 -*-

import csv,sys,re,string

###################################################################################################
# Global variable definitions
###################################################################################################

# All rules currently loaded. Described layout of one rule: {name, sub_rules, script, result}
# (add_rule in this file stores only "name", "sub_rules" and "result").
# sub_rules layout: [[op, match_content], ...]
# result layout: [[op, result_content], ...]
global_rules = []

# Every content line that has been read
global_contents = []

# Regular expressions from the configuration file
# Each entry is ["code", "regular expression"]
global_config_regex = []
# One-element list so read_regex can update the joined code string in place
global_config_regex_str = [""]

# Runtime options. "single_result": whether duplicates are removed, i.e. when several
# output lines are exactly identical they are emitted only once
sys_args = {"single_result":True}
# Results already printed / scripts already executed, used by the duplicate filter in do_result1
arg_allow_same_help = []
arg_allow_same_script_help = []

###################################################################################################
# Symbol definitions
###################################################################################################

# Marker line that terminates the text input
END_INPUT_FLAG = "[---END---]"

# Placeholder symbols used in rule and result text
MATCHED_LINE_FLAG = "__MATCHED_LINE"
LINE_ANY_STR = "__LINE_ANY_STR"
ANY_NUM_LINE = "__ANY_NUM_LINE-"
# NOTE(review): identical value to ANY_NUM_LINE and not referenced in the visible code
ANY_NUM_LINE2 = "__ANY_NUM_LINE-"

# Rule keywords (the "op" of a sub-rule)
RULE_KEY_INCLUDE = "include"
RULE_KEY_INCLUDEOR = "include-or"
RULE_KEY_EQUAL = "equal"
RULE_KEY_PATTERN = "pattern"

###################################################################################################
#函数定义
###################################################################################################

###################################################################################################
#主函数
def main():
    """Command-line entry point.

    Usage: ``smartscript script_file [-s]``

    Loads the regex configuration and the rule script, reads the text to
    analyse from standard input and then runs every rule against it.
    Returns None in all cases; it stops early when no script file is given
    or when loading the rules / content reports failure.
    """
    if len(sys.argv) <= 1:
        # Parenthesised single-argument print behaves the same on Python 2 and 3.
        print("smartscript script_file")
        return
    script_file = sys.argv[1]

    # "-s" clears single_result, i.e. identical results are NOT filtered out.
    if len(sys.argv) == 3 and sys.argv[2] == "-s":
        sys_args["single_result"] = False

    # Load the predefined regular expressions before the rules that use them.
    read_regex(global_config_regex, global_config_regex_str)
    if not read_rules(script_file):
        return
    if not read_content():
        return
    do_parse(global_rules, global_contents)

#执行文本解析		   
def do_parse(rules_all, contents_all):
    """Run each rule of ``rules_all``, in order, against all content lines.

    Every rule is matched against the complete ``contents_all`` list by
    ``single_rule_match``. Returns None.
    """
    for current_rule in rules_all:
        single_rule_match(current_rule, contents_all)

###################################################################################################
#对一个规则匹配所有数据文本	
def single_rule_match(rule, contents_all):
    """Match one rule against all content lines and emit its results.

    The matcher is chosen from the op of the rule's first sub-rule:
    "include" uses do_include_match, "equal" and "pattern" both use
    do_equal_match. Any other op (including "include-or", which has no
    matcher) does nothing. When at least one match was collected the rule's
    results are executed through do_result1. Returns None.
    """
    op = rule["sub_rules"][0][0]
    inputs = []   # handed to the matchers as arg_cache0
    hits = []     # filled by the matchers with one record per match

    if op == RULE_KEY_INCLUDE:
        do_include_match(rule, contents_all, inputs, hits)
    elif op in (RULE_KEY_EQUAL, RULE_KEY_PATTERN):
        do_equal_match(rule, contents_all, inputs, hits)
    else:
        return

    if hits:
        do_result1(rule, hits)

###################################################################################################
#匹配成功后执行结果,为了脚本的中间变量能在result中使用,
#执行脚本和输出结果写在同一个函数中
def do_result1(rule, arg_cache1):
    """Emit the results of ``rule`` for every match record in ``arg_cache1``.

    Each record is a dict with "matched_line" (list of matched lines) and
    "values" (named values captured while matching). For every entry
    ``[op, content]`` of ``rule["result"]``:

    * op "print": ``content`` is printed with MATCHED_LINE_FLAG replaced by
      the matched lines joined with "\\r\\n".
    * op "script": ``content`` is executed as Python code.

    When ``sys_args["single_result"]`` is set, a print result whose text was
    already printed, or a script already executed with equal match data, is
    skipped. Only that one result is skipped; the remaining results and
    matches are still processed. Returns None.
    """
    for args in arg_cache1:
        # These names are deliberately plain locals: script results are run
        # with exec() in this scope and may refer to them.
        _VALUE_ = args["values"]
        matched_line = args["matched_line"]
        _MATCHED_LINE_ = "\r\n".join(matched_line)
        results = rule["result"]

        for r in results:
            if "print" == r[0]:
                # Substitute the matched-lines placeholder.
                result = r[1].replace(MATCHED_LINE_FLAG, _MATCHED_LINE_)

                # Duplicate filter: skip this output only, keep going.
                if sys_args["single_result"] and result in arg_allow_same_help:
                    continue
                arg_allow_same_help.append(result)

                print(result)
            if "script" == r[0]:
                # Duplicate filter: same script text with the same match data.
                already_run = False
                if sys_args["single_result"]:
                    for t in arg_allow_same_script_help:
                        if t[0] == r[1] and is_list_same(t[1], args):
                            already_run = True
                            break
                if already_run:
                    continue
                arg_allow_same_script_help.append([r[1], args])

                # SECURITY NOTE: this executes arbitrary Python taken from the
                # rule file; only load rule files from a trusted source.
                exec(r[1])
    return

#比较两个list是否相同	
def is_list_same(a, b):
    """Return True when ``a`` and ``b`` hold the same keys with equal values.

    Both arguments are iterated for their keys and indexed by those keys.
    They are considered the same when they have the same length and every
    key of ``a`` also occurs in ``b`` with an equal value.
    """
    if len(a) != len(b):
        return False
    agreeing = 0
    for key in a:
        if any(key == other and a[key] == b[other] for other in b):
            agreeing += 1
    return agreeing == len(a)

###################################################################################################
#include规则匹配
def do_include_match(rule, contents_all, arg_cache0, arg_cache1):
    """Collect every content line that contains the rule's text.

    The text searched for is the match content of the rule's first sub-rule.
    For each line of ``contents_all`` containing it, a separate record
    ``{"matched_line": [line], "values": {}}`` is appended to ``arg_cache1``.
    Each hit gets its own record so that hits do not alias one shared,
    ever-growing dict. ``arg_cache0`` is accepted for signature
    compatibility and is not used. Returns None.
    """
    needle = rule["sub_rules"][0][1]
    for line in contents_all:
        if needle in line:
            arg_cache1.append({"matched_line": [line], "values": {}})
    return

###################################################################################################
#equal规则匹配,equal规则是每行完全匹配
def do_equal_match(rule, contents_all, arg_cache0, arg_cache1):
	"""Match a multi-line rule against the content lines using a sliding window.

	The match content of the rule's first sub-rule is split into rule lines
	with get_lines(); the op of that sub-rule is passed to is_match() as the
	comparison type. Content lines are appended one at a time to a window
	(lines_tmp). Whenever the window holds at least as many lines as the rule,
	the window is compared with the rule lines from the start. On a full match
	a record {"matched_line": [...window lines...], "values": {...}} is appended
	to arg_cache1 and the window is emptied; on a mismatch the oldest window
	line is dropped.

	A rule line containing ANY_NUM_LINE ("__ANY_NUM_LINE-<n>") acts as a
	wildcard for a bounded number of lines: the following rule line is tried
	against successive window lines until it matches or the counter runs out.

	arg_cache0 is not used by this function. Returns None.
	"""
	sub_rule = rule["sub_rules"]
	rule_tmp = get_lines(sub_rule[0][1])					
	rule_type = sub_rule[0][0]
	lines_tmp = []

	for line in contents_all:	
		lines_tmp.append(line)

		# Wildcard state is reset for every newly appended content line.
		any_num_line = 0          # remaining wildcard budget; 0 = no wildcard active
		next_match_rule_line = 0  # index of the rule line that follows the wildcard; 0 = none

		if len(lines_tmp) >= len(rule_tmp):
			c = 0      # NOTE(review): c is updated below but never read in this function
			j = 0      # index of the rule line currently being matched
			matched = False

			arg_item = {"matched_line":[], "values":{}}

			# Start matching the window against the rule
			for i in range(0,len(lines_tmp)): 
				# ANY_NUM_LINE handling first: if the current rule line is an ANY_NUM_LINE
				# marker, read the maximum number of lines to skip and try the NEXT rule
				# line directly; once that matches, continue with the rule line after it.
				if 0 == any_num_line and -1 != rule_tmp[j].find(ANY_NUM_LINE):
					t = rule_tmp[j].split("-")
					if len(t) >= 2:
						# Armed with n + 2 and decremented right below before use, so
						# n + 1 consecutive window lines are tried against the next rule line.
						any_num_line = string.atoi(t[1]) + 2  # maximum number of lines to skip
						next_match_rule_line = j + 1  # index of the next rule line
						if next_match_rule_line >= len(rule_tmp):
							# The marker is the last rule line: there is nothing to look ahead to.
							next_match_rule_line = 0
				if any_num_line > 0: # count down the skip budget
					any_num_line = any_num_line - 1

				if 0 != any_num_line and 0 != next_match_rule_line: # wildcard active: try the next rule line
					if True == is_match(lines_tmp[i], rule_tmp[next_match_rule_line], rule_type, arg_item["values"]):
						# Skip both the marker line and the rule line that just matched.
						j = j + 2
						# The look-ahead line matched, so clear the ANY_NUM_LINE state.
						next_match_rule_line = 0
						any_num_line = 0
						if j == len(rule_tmp): # the last rule line was hit: complete match
							matched = True
							break
					continue

				# Regular (non-ANY_NUM_LINE) comparison: line by line, every line must match.
				if False == is_match(lines_tmp[i], rule_tmp[j], rule_type, arg_item["values"]):
					# Mismatch: slide the window forward by dropping its oldest line.
					lines_tmp.pop(0)
					c = -1
					break
				else:
					if j == len(rule_tmp) - 1:
						matched = True
						break
					j = j + 1 # one rule line matched, advance to the next rule line
				c = c + 1
		
			if matched: # full match
				for tmp in lines_tmp:
					arg_item["matched_line"].append(tmp)
				arg_cache1.append(arg_item)
				del lines_tmp[0:len(lines_tmp)] # empty the window

	return

###################################################################################################
#比较一行是否匹配,支持equal和pattern		
def is_match(line, rule, type, values):
    """Tell whether one content line satisfies one rule line.

    A rule line equal to LINE_ANY_STR matches anything. Otherwise the
    comparison depends on ``type``: RULE_KEY_EQUAL requires ``line`` to equal
    ``rule``; RULE_KEY_PATTERN delegates to pattern_line_match, which also
    stores captured named groups in ``values``. Any other type never matches.
    Returns True or False.
    """
    if rule == LINE_ANY_STR:
        return True
    if type == RULE_KEY_EQUAL:
        hit = (line == rule)
    elif type == RULE_KEY_PATTERN:
        hit = pattern_line_match(rule, line, values)
    else:
        hit = False
    return True if hit else False

###################################################################################################
#格式化输入的文本数据,去除一些不需要的字符
def line_format(line):
    """Return ``line`` with leading and trailing whitespace removed."""
    return line.strip()

###################################################################################################
#格式化多行,将\r\n改为\n,将每行前后的空格去除,并去除空行
def mutiline_format(txt):
    """Normalise a multi-line text.

    "\\r\\n" is converted to "\\n", every line is stripped of surrounding
    whitespace, empty lines are dropped and the remaining lines are joined
    with "\\n".
    """
    stripped = [piece.strip() for piece in txt.replace("\r\n", "\n").split("\n")]
    return "\n".join([piece for piece in stripped if piece != ""])

###################################################################################################
#读取脚本文件
def read_rules(rules_file):
    """Load the rule script (a CSV file) into the global rule list.

    The first row is treated as a header and skipped. Every other row is
    validated with check_rule and, when valid, stored with add_rule; invalid
    rows are reported and skipped.

    Returns True on success. Returns False, after printing the error, when
    the file cannot be opened or the CSV parser reports an error.
    """
    try:
        # Binary mode is what the Python 2 csv module expects.
        handle = open(rules_file, "rb")
    except IOError as e:
        print(e)
        return False

    # "with" guarantees the file is closed even when parsing fails.
    with handle:
        try:
            for index, row in enumerate(csv.reader(handle)):
                if index == 0:
                    continue  # header row
                if check_rule(row):
                    add_rule(row)
                else:
                    print("add rule fail: %s" % (row,))
        except csv.Error as e:
            print(e)
            return False
    return True

###################################################################################################
#读取正则表达式的配置文件
def read_regex(regex, regex_str, path="smartscript/pattern.csv"):
    """Load the predefined regular expressions from a CSV configuration file.

    The first row is treated as a comment row and skipped. For every other
    row with at least two columns, ``[code, expression]`` is appended to
    ``regex`` and the code is added to ``regex_str[0]``, which accumulates
    all codes joined with "|". Rows with fewer than two columns are ignored.

    ``path`` defaults to the location the script has always used. When the
    file cannot be opened a message is printed and both output arguments are
    left untouched. Returns None.
    """
    try:
        # Binary mode is what the Python 2 csv module expects.
        handle = open(path, "rb")
    except IOError:
        print("正则表达式文件损坏")
        return

    # "with" guarantees the file is closed once all rows are consumed.
    with handle:
        for index, row in enumerate(csv.reader(handle)):
            if index == 0:
                continue  # comment row
            if len(row) < 2:
                continue  # blank or incomplete row
            if "" == regex_str[0]:
                regex_str[0] = row[0]
            else:
                regex_str[0] = regex_str[0] + "|" + row[0]
            regex.append([row[0], row[1]])
    return

###################################################################################################
#添加rule到global_rules中去
def add_rule(line):
    """Store one rule-script row in ``global_rules``.

    ``line`` is a row with the columns name, op, match content, result op
    and result content. The match content is normalised with
    mutiline_format and its predefined-regex placeholders are expanded with
    get_regex_str. If a rule with the same name already exists, the row is
    added to it as a further sub-rule/result; otherwise a new rule is
    created. Always returns True.
    """
    expanded = get_regex_str(
        mutiline_format(line[2]), global_config_regex, global_config_regex_str[0])
    matcher = [line[1], expanded]
    outcome = [line[3], line[4]]
    name = line[0]

    # Rows sharing a name are sub-rules of one rule.
    for known in global_rules:
        if known["name"] == name:
            known["sub_rules"].append(matcher)
            known["result"].append(outcome)
            return True

    # First row with this name: create the rule.
    global_rules.append({"name": name, "sub_rules": [matcher], "result": [outcome]})
    return True

###################################################################################################
#检查脚本是否合法
def check_rule(script):
    """Return True when a rule-script row can be stored by add_rule.

    add_rule reads the first five columns of a row (name, op, match content,
    result op, result content), so rows with fewer columns, such as blank
    lines, are rejected here instead of raising IndexError later.
    """
    return len(script) >= 5

###################################################################################################
#从标准输入流读取需要分析的文本数据
def read_content():
    """Read the text to analyse from standard input into ``global_contents``.

    Every line is cleaned with line_format; empty lines are ignored. Reading
    stops at a line equal to END_INPUT_FLAG or at end of input, so piping a
    file without the end marker no longer aborts with EOFError.
    Always returns True.
    """
    try:
        read_line = raw_input   # Python 2
    except NameError:
        read_line = input       # Python 3 name for the same function

    while True:
        try:
            raw = read_line()
        except EOFError:
            # End of the stream is treated like the explicit end marker.
            return True
        line = line_format(raw)
        if line == "":
            continue
        if END_INPUT_FLAG == line:
            return True
        global_contents.append(line)

###################################################################################################
#正则表达式匹配
def pattern_line_match(rule, line, rule_reg2):
    """Match ``line`` against the regular expression ``rule``.

    The expression is applied with ``re.match`` semantics (anchored at the
    start of the line). On success every named group of the expression is
    stored in the dict ``rule_reg2`` under its group name: as an int when
    the captured text consists only of digits, as the captured string
    otherwise, and as None when the group did not take part in the match.

    Returns True when the line matches. Returns False when it does not, when
    either argument is None, or when ``rule`` is not a valid expression.
    """
    if rule is None or line is None:
        return False
    try:
        rule_pattern = re.compile(rule)
    except re.error:
        # An invalid expression can never match; treat it as "no match"
        # instead of aborting the whole run.
        return False
    matched = rule_pattern.match(line)
    if matched is None:
        return False
    # groupdict() yields exactly the (?P<name>...) groups of the expression.
    for name, text in matched.groupdict().items():
        if text is None:
            rule_reg2[name] = None  # optional group that was not matched
        else:
            rule_reg2[name] = is_integer(text)[1]
    return True

###################################################################################################
#从规则中取出关键字
def get_key(value_list, pattern_str):
    """Append the names of all ``(?P<name>`` groups in ``pattern_str``.

    The pattern is scanned character by character with a small state
    machine that recognises the prefix "(", "?", "P", "<" on consecutive
    positions and then collects everything up to the next ">" as the name.
    Names are appended to ``value_list`` in order of appearance.
    Returns None.
    """
    prefix_char = ''   # last character of the "(?P" prefix recognised so far
    prefix_pos = 0     # position at which prefix_char was seen
    name_start = 0     # position of the first character of a name; 0 = not in a name
    for pos, ch in enumerate(pattern_str):
        if ch == '(':
            prefix_char, prefix_pos = '(', pos
            continue
        if prefix_pos == pos - 1:
            # Directly behind the recognised prefix: try to extend it.
            if prefix_char == '(' and ch == '?':
                prefix_char, prefix_pos = '?', pos
            elif prefix_char == '?' and ch == 'P':
                prefix_char, prefix_pos = 'P', pos
            elif prefix_char == 'P' and ch == '<':
                name_start = pos + 1
        elif name_start != 0:
            # Inside a group name: wait for the closing ">".
            if ch == '>':
                value_list.append(pattern_str[name_start:pos])
                name_start = 0
        else:
            # Not part of any prefix or name: start over.
            name_start = 0
            prefix_char = ''
            prefix_pos = 0
    return

###################################################################################################
#将一个字符串按照\n分为数组
def get_lines(txt):
    """Split ``txt`` into a list of lines.

    "\\r\\n" line endings are converted to "\\n" first, so the returned lines
    never carry a trailing "\\r" from Windows-style input.
    """
    return txt.replace("\r\n", "\n").split("\n")

###################################################################################################
#检查参数是否是一个数字
def is_integer(s):
    """Check whether the string ``s`` consists only of digits.

    Returns a tuple ``(True, value)`` with ``value`` being ``s`` converted to
    an integer when ``s`` is a non-empty run of digits, and ``(False, s)``
    with the unchanged string otherwise.
    """
    # Must start with a digit and must not contain any non-digit character.
    if re.match(r'\d+', s) is None or re.search(r'\D', s) is not None:
        return False, s
    # int() replaces the deprecated string.atoi(), which Python 3 removed.
    return True, int(s)

###################################################################################################
#将rule中的预定义正则表达式转换为真实的正则表达式
def get_regex_str(org, regex_list, code_str):
    """Expand predefined-regex placeholders in a rule text.

    A placeholder has the form ``CODE-name-`` where CODE is one of the
    alternatives in ``code_str`` (codes joined with "|") and ``name`` is 1 to
    15 letters or digits. Each placeholder is replaced by
    ``(?P<name>(EXPR))`` where EXPR is the expression registered for CODE in
    ``regex_list`` (looked up with get_regex_by_code).

    When ``code_str`` is empty no codes are configured and ``org`` is
    returned unchanged; without this guard the empty alternative would match
    every ``-name-`` sequence and corrupt ordinary rule text.
    """
    if not code_str:
        return org
    expanded = org
    for found in re.findall("((%s)-([a-zA-Z0-9]{1,15})-)" % code_str, org):
        # found[0]: whole placeholder, found[1]: code, found[2]: group name
        expression = get_regex_by_code(regex_list, found[1])
        expanded = expanded.replace(found[0], "(?P<%s>(%s))" % (found[2], expression))
    return expanded

###################################################################################################
#根据code,从正则表达式列表中查询正则表达式内容
def get_regex_by_code(regex_list, code):
    """Look up the regular expression registered for ``code``.

    ``regex_list`` holds ``[code, expression]`` entries. The expression of
    the first entry whose code equals ``code`` is returned, or "" when there
    is no such entry.
    """
    candidates = (entry[1] for entry in regex_list if entry[0] == code)
    return next(candidates, "")

###################################################################################################
# Run the main function when this file is executed as a script
###################################################################################################
if __name__ == "__main__":
	main()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: