Python数据预处理练习
2016-07-28 09:08
411 查看
# coding=utf-8
"""Data preprocessing exercises for CSV data.

A data matrix is a list of rows as returned by ``fileREAD``; every cell is a
string.  A "column" is a plain list taken from the matrix with ``getRow``.
The column-level functions that start their loops at index 1 treat index 0
as the header cell and never modify it.

The code runs unchanged on Python 2 and Python 3.
"""
import csv
import math
import re


def _toFloat(value):
    """Return value as a finite float, or None when it is not numeric."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    # float() also accepts "nan" and "inf"; those are not usable data points.
    if math.isnan(number) or math.isinf(number):
        return None
    return number


def _numericValues(cells):
    """Return the numeric cells of cells as floats, in their original order."""
    numbers = []
    for cell in cells:
        number = _toFloat(cell)
        if number is not None:
            numbers.append(number)
    return numbers


def _sortedNumericCells(inList):
    """Return (value, index) pairs of the numeric cells after the header.

    The pairs are sorted by value (ties by index) so that consecutive pairs
    can be grouped into bins.
    """
    cells = []
    for index in range(1, len(inList)):
        number = _toFloat(inList[index])
        if number is not None:
            cells.append((number, index))
    cells.sort()
    return cells


def _assignBinMean(inList, binCells):
    """Overwrite every cell referenced by binCells with the mean of the bin."""
    mean = average(value for value, _ in binCells)
    for _, index in binCells:
        inList[index] = mean


def fileREAD(fileURL, access):
    """Read a CSV file and return its content as a list of rows.

    Args:
        fileURL: path of the CSV file.
        access: mode string handed to ``open`` (the demo passes ``"r"``).

    Returns:
        A list holding one list of string cells per CSV record.
    """
    # The context manager closes the file even when parsing raises.
    with open(fileURL, access) as csvfile:
        return list(csv.reader(csvfile))


def getLine(inList, Line):
    """Return the row stored at index Line of the matrix."""
    return inList[Line]


def getRow(inList, Row):
    """Return column Row of the matrix as a new list (one cell per row)."""
    return [line[Row] for line in inList]


def setLine(inList, childList, Line):
    """Replace the row at index Line of the matrix with childList."""
    inList[Line] = childList


def setRow(inList, chikdList, Row):
    """Overwrite column Row of the matrix with the values of chikdList.

    Only the first ``len(chikdList)`` rows are touched.
    """
    for index, value in enumerate(chikdList):
        inList[index][Row] = value


def addLine(inList, childLine):
    """Append childLine to the matrix as a new last row."""
    inList.append(childLine)


def addRow(inList, childRow):
    """Append one value of childRow to the end of every row of the matrix.

    Raises:
        IndexError: if childRow has fewer items than the matrix has rows.
    """
    for index, line in enumerate(inList):
        line.append(childRow[index])


def getAVG(inList):
    """Return the mean of the numeric cells of a column.

    Non-numeric cells (header text, blanks) are ignored.

    Returns:
        The mean as a float, or the message string "当前特征无平均值" when
        the column holds no numeric cell.
    """
    values = _numericValues(inList)
    if not values:
        return "当前特征无平均值"
    return sum(values) / len(values)


def getAVE(inList):
    """Return the population variance of the numeric cells of a column.

    Non-numeric cells are ignored.  The squared deviations from the mean are
    divided by the number of numeric cells.

    Returns:
        The variance as a float, or the message string "当前特征无方差" when
        the column holds no numeric cell.
    """
    values = _numericValues(inList)
    if not values:
        return "当前特征无方差"
    mean = sum(values) / len(values)
    return sum((value - mean) ** 2 for value in values) / len(values)


def average(seq, total=0.0):
    """Return the arithmetic mean of the numbers in seq.

    Args:
        seq: iterable of numbers; it is consumed once.
        total: start value that the items are added to before dividing.

    Raises:
        ZeroDivisionError: if seq is empty.
    """
    num = 0
    for item in seq:
        total += item
        num += 1
    return total / num


def getQUANTILE(inList, inlocaltion):
    """Return a quantile of the numeric cells of a column.

    Args:
        inList: column of cells; non-numeric cells are ignored.
        inlocaltion: requested quantile, ``0 <= inlocaltion < 1``.

    Returns:
        For 0.5 the median (mean of the two middle values when the count is
        even).  For any other quantile the sorted value at index
        ``int(count * inlocaltion)``, without interpolation.  The message
        string "输入的分位数数值错误" is returned for a quantile outside the
        accepted range and "当前特征不可求中位数" when the column holds no
        numeric cell.
    """
    if not 0 <= inlocaltion < 1:
        return "输入的分位数数值错误"
    localLst = sorted(_numericValues(inList))
    count = len(localLst)
    if count == 0:
        return "当前特征不可求中位数"
    if inlocaltion == 0.5:
        middle = count // 2
        if count % 2 == 1:
            return localLst[middle]
        return (localLst[middle - 1] + localLst[middle]) / 2.0
    return localLst[int(count * inlocaltion)]


def removeNoiseAuto(inList):
    """Blank out outliers of a column using the interquartile range.

    A numeric cell is replaced by ``''`` when it lies more than 1.5 times
    the interquartile range above the upper quartile or below the lower
    quartile.  Blank and non-numeric cells are left alone, and a column
    without numeric cells is returned unchanged.

    Returns:
        inList itself, modified in place.
    """
    if not _numericValues(inList):
        return inList
    Q3 = getQUANTILE(inList, 0.75)
    Q1 = getQUANTILE(inList, 0.25)
    fence = 1.5 * (Q3 - Q1)
    for index in range(1, len(inList)):
        number = _toFloat(inList[index])
        if number is None:
            continue
        if number - Q3 > fence or Q1 - number > fence:
            inList[index] = ''
    return inList


def removeNoiseByThresholdMin(inList, inThresholdMin):
    """Blank out the numeric cells that are smaller than inThresholdMin.

    Blank and non-numeric cells are skipped.

    Returns:
        inList itself, modified in place.
    """
    for index in range(1, len(inList)):
        number = _toFloat(inList[index])
        if number is not None and number < inThresholdMin:
            inList[index] = ''
    return inList


def removeNoiseByThresholdMax(inList, inThresholdMax):
    """Blank out the numeric cells that are larger than inThresholdMax.

    Blank and non-numeric cells are skipped.

    Returns:
        inList itself, modified in place.
    """
    for index in range(1, len(inList)):
        number = _toFloat(inList[index])
        if number is not None and number > inThresholdMax:
            inList[index] = ''
    return inList


def autoPaddingByAVG(inList):
    """Fill the blank cells of a column with the column mean.

    The mean is written as ``str(mean)``.  When the column holds no numeric
    cell there is nothing to fill with and the column is returned unchanged.

    Returns:
        inList itself, modified in place.
    """
    if not _numericValues(inList):
        return inList
    filler = str(getAVG(inList))
    for index in range(1, len(inList)):
        if inList[index] == '':
            inList[index] = filler
    return inList


def autoPaddingByMedian(inList):
    """Fill the blank cells of a column with the column median.

    The median is written as ``str(median)``.  When the column holds no
    numeric cell the column is returned unchanged.

    Returns:
        inList itself, modified in place.
    """
    if not _numericValues(inList):
        return inList
    filler = str(getQUANTILE(inList, 0.5))
    for index in range(1, len(inList)):
        if inList[index] == '':
            inList[index] = filler
    return inList


def binningWidth(inList, width):
    """Discretise a column by width-limited bins smoothed with the bin mean.

    The numeric cells after the header are visited in ascending order.  A new
    bin starts as soon as a value exceeds the first value of the current bin
    by more than width.  Every cell of a bin is replaced by the mean of the
    bin (a float).  Blank and non-numeric cells are left alone.

    Returns:
        inList itself, modified in place.
    """
    cells = _sortedNumericCells(inList)
    start = 0
    for position in range(1, len(cells) + 1):
        # Close the running bin at the end of the data or when the next
        # value is too far from the value that opened the bin.
        atEnd = position == len(cells)
        if atEnd or cells[position][0] - cells[start][0] > width:
            _assignBinMean(inList, cells[start:position])
            start = position
    return inList


def binningDeep(inList, deep1):
    """Discretise a column by equal-frequency bins smoothed with the bin mean.

    The numeric cells after the header are sorted and cut into consecutive
    bins of deep1 values each; the last bin may hold fewer.  Every cell of a
    bin is replaced by the mean of the bin (a float).  Blank and non-numeric
    cells are left alone.

    Args:
        inList: column to discretise.
        deep1: number of values per bin, an integer of at least 1.

    Returns:
        inList itself, modified in place.

    Raises:
        ValueError: if deep1 is smaller than 1.
    """
    if deep1 < 1:
        raise ValueError("deep1 must be at least 1")
    cells = _sortedNumericCells(inList)
    for start in range(0, len(cells), deep1):
        _assignBinMean(inList, cells[start:start + deep1])
    return inList


def oneHot(inList, Row):
    """Append one-hot encoded columns for column Row of the matrix.

    One column is appended per distinct value found below the header, in
    order of first appearance.  The header cell of a new column is the value
    itself and the data cells are ``'1'`` or ``'0'``.

    Returns:
        inList itself, modified in place.
    """
    labels = getRow(inList, Row)[1:]
    categories = []
    seen = set()
    for label in labels:
        if label not in seen:
            seen.add(label)
            categories.append(label)
    for category in categories:
        newColumn = [category]
        newColumn.extend('1' if label == category else '0' for label in labels)
        addRow(inList, newColumn)
    return inList


def minMax(inList):
    """Rescale the numeric cells of a column to the range 0..1.

    Each numeric cell after the header is replaced by
    ``(value - min) / (max - min)`` formatted with four decimals.  When all
    numeric cells are equal they become ``'0.0000'``.  Blank and non-numeric
    cells are left alone.

    Returns:
        inList itself, modified in place.
    """
    cells = []
    for index in range(1, len(inList)):
        number = _toFloat(inList[index])
        if number is not None:
            cells.append((index, number))
    if not cells:
        return inList
    minvalue = min(number for _, number in cells)
    maxvalue = max(number for _, number in cells)
    span = maxvalue - minvalue
    for index, number in cells:
        # A constant column has no spread; avoid dividing by zero.
        scaled = (number - minvalue) / span if span else 0.0
        inList[index] = "%.4f" % scaled
    return inList


def zScore(inList):
    """Standardise the numeric cells of a column (z-score).

    Each numeric cell after the header is replaced by
    ``(value - mean) / standard deviation`` formatted with four decimals,
    using the mean from ``getAVG`` and the square root of the variance from
    ``getAVE``.  When the standard deviation is zero the cells become
    ``'0.0000'``.  A column without numeric cells is returned unchanged.

    Returns:
        inList itself, modified in place.
    """
    if not _numericValues(inList):
        return inList
    u = getAVG(inList)
    stand = math.sqrt(getAVE(inList))
    for index in range(1, len(inList)):
        number = _toFloat(inList[index])
        if number is None:
            continue
        score = (number - u) / stand if stand else 0.0
        inList[index] = "%.4f" % score
    return inList


def similarityDistance(inList1, inList2, n):
    """Return the Minkowski distance of order n between two columns.

    The header cells at index 0 are skipped; every other cell must be
    convertible with ``float``.  ``n == 2`` gives the Euclidean distance and
    ``n == 1`` the Manhattan distance.

    Raises:
        ValueError: if n is not positive.
        IndexError: if inList2 is shorter than inList1.
    """
    if n <= 0:
        raise ValueError("n must be positive")
    total = 0.0
    for index in range(1, len(inList1)):
        total += abs(float(inList1[index]) - float(inList2[index])) ** n
    return pow(total, 1.0 / n)


def similaritySim(inList1, inList2):
    """Return the cosine similarity of two columns.

    The header cells at index 0 are skipped; every other cell must be
    convertible with ``float``.  When either column has zero length as a
    vector the similarity is undefined and 0.0 is returned.

    Raises:
        IndexError: if inList2 is shorter than inList1.
    """
    dot = 0.0
    for index in range(1, len(inList1)):
        dot += float(inList1[index]) * float(inList2[index])
    squares1 = sum(float(cell) ** 2 for cell in inList1[1:])
    squares2 = sum(float(cell) ** 2 for cell in inList2[1:])
    norm = math.sqrt(squares1) * math.sqrt(squares2)
    if norm == 0:
        return 0.0
    return dot / norm


if __name__ == "__main__":
    # Guarded so that importing this module does not try to read the
    # author's local training file.
    fileInput = fileREAD("D:\\PythonWorkSpace\\ExternalFile\\train.csv", "r")

    # Usage examples; uncomment the ones you want to run.
    #
    # # Get one row
    # print(getLine(fileInput, 1))
    #
    # # Get one column
    # print(getRow(fileInput, 0))
    #
    # # Set one row
    # print("before:")
    # print(getLine(fileInput, 1))
    # setLine(fileInput, getLine(fileInput, 2), 1)
    # print("after:")
    # print(getLine(fileInput, 1))
    #
    # # Set one column
    # print("before:")
    # print(getRow(fileInput, 1))
    # setRow(fileInput, getRow(fileInput, 2), 1)
    # print("after:")
    # print(getRow(fileInput, 1))
    #
    # # Mean
    # print(getAVG(getRow(fileInput, 9)))
    #
    # # Variance
    # print(getAVE(getRow(fileInput, 9)))
    #
    # # Quantile
    # print(getQUANTILE(getRow(fileInput, 9), 0.5))
    #
    # # Noise filtering 1
    # print(removeNoiseAuto(getRow(fileInput, 1)))
    #
    # # Noise filtering 2
    # print(removeNoiseByThresholdMin(getRow(fileInput, 0), 10))
    #
    # # Noise filtering 3
    # print(removeNoiseByThresholdMax(getRow(fileInput, 0), 10))
    #
    # # Missing value padding 1
    # print(autoPaddingByAVG(getRow(fileInput, 0)))
    #
    # # Missing value padding 2
    # print(autoPaddingByMedian(getRow(fileInput, 0)))
    #
    # # Equal-width binning
    # print(binningWidth(getRow(fileInput, 0), 3))
    #
    # # Equal-frequency binning
    # print(binningDeep(getRow(fileInput, 0), 3))
    #
    # # One-hot encoding
    # oneHot(fileInput, 1)
    # for i in fileInput:
    #     print(i)
    #
    # # Min-max normalisation
    # print(minMax(getRow(fileInput, 0)))
    #
    # # z-score normalisation
    # print(zScore(getRow(fileInput, 0)))
    #
    # # Distance
    # print(similarityDistance(getRow(fileInput, 0), getRow(fileInput, 0), 2))
    #
    # # Cosine similarity
    # print(similaritySim(getRow(fileInput, 0), getRow(fileInput, 1)))
相关文章推荐
- Python动态类型的学习---引用的理解
- Python3写爬虫(四)多线程实现数据爬取
- 垃圾邮件过滤器 python简单实现
- 下载并遍历 names.txt 文件,输出长度最长的回文人名。
- install and upgrade scrapy
- Scrapy的架构介绍
- Centos6 编译安装Python
- 使用Python生成Excel格式的图片
- 让Python文件也可以当bat文件运行
- [Python]推算数独
- Python中zip()函数用法举例
- Python中map()函数浅析
- 我是运营,我没有假期
- Python将excel导入到mysql中
- Python在CAM软件Genesis2000中的应用
- 使用Shiboken为C++和Qt库创建Python绑定
- FREEBASIC 编译可被python调用的dll函数示例