您的位置:首页 > 编程语言 > Python开发

Python数据预处理练习

2016-07-28 09:08 411 查看
#encoding=utf-8
import math
import re
import csv

def fileREAD(fileURL, access):
    """Read a CSV file and return its contents as a two-dimensional list.

    Args:
        fileURL: Path of the CSV file to read.
        access: Mode string handed to ``open`` (for example ``"r"``).

    Returns:
        A list with one entry per CSV record; each entry is the list of
        fields produced by ``csv.reader`` for that record.
    """
    # ``open`` replaces the Python-2-only ``file`` builtin, and the ``with``
    # block closes the handle even when parsing raises part-way through.
    with open(fileURL, access) as csvfile:
        return list(csv.reader(csvfile))

def getLine(inList, Line):
    """Return the row stored at position ``Line`` of the 2-D list ``inList``."""
    selected = inList[Line]
    return selected

def getRow(inList, Row):
    """Return a new list holding element ``Row`` of every row in ``inList``."""
    return [record[Row] for record in inList]

def setLine(inList, childList, Line):
    """Replace the row at position ``Line`` of ``inList`` with ``childList``.

    The matrix is modified in place; nothing is returned.
    """
    inList[Line] = childList

def setRow(inList, chikdList, Row):
    """Overwrite column ``Row`` of ``inList`` with the values of ``chikdList``.

    The n-th value of ``chikdList`` is written into the n-th row of
    ``inList``; the matrix is modified in place and nothing is returned.
    """
    for position, value in enumerate(chikdList):
        inList[position][Row] = value

def addLine(inList, childLine):
    """Append ``childLine`` as a new last row of the matrix ``inList``.

    The matrix is modified in place; nothing is returned.
    """
    inList.append(childLine)

def addRow(inList, childRow):
    """Append a new column to the matrix ``inList``.

    The n-th value of ``childRow`` is appended to the n-th row of
    ``inList``; the matrix is modified in place and nothing is returned.
    """
    for position, record in enumerate(inList):
        record.append(childRow[position])

def getAVG(inList):
    """Return the arithmetic mean of the numeric entries of ``inList``.

    Entries that cannot be parsed as a finite number (column headers, empty
    strings, ``"NaN"`` and similar) are ignored.

    Args:
        inList: Iterable of cell values, typically the strings of one CSV
            column.

    Returns:
        The mean as a float, or the message string "当前特征无平均值" when no
        entry is numeric.
    """
    total = 0.0
    count = 0
    for item in inList:
        # float() validates the whole cell.  The previous prefix-only regex
        # let "12abc" through (crashing in float()) and silently dropped
        # negative values such as "-3.5".
        try:
            value = float(item)
        except (TypeError, ValueError):
            continue
        if math.isnan(value) or math.isinf(value):
            continue
        total += value
        count += 1
    if count == 0:
        return "当前特征无平均值"
    return total / count

def getAVE(inList):
    """Return the population variance of the numeric entries of ``inList``.

    Entries that cannot be parsed as a finite number (column headers, empty
    strings, ``"NaN"`` and similar) are ignored.

    Args:
        inList: Iterable of cell values, typically the strings of one CSV
            column.

    Returns:
        The variance (mean squared deviation from the mean) as a float, or
        the message string "当前特征无方差" when no entry is numeric.
    """
    values = []
    for item in inList:
        # float() validates the whole cell; the previous prefix-only regex
        # crashed on "12abc" and dropped negative values.
        try:
            value = float(item)
        except (TypeError, ValueError):
            continue
        if math.isnan(value) or math.isinf(value):
            continue
        values.append(value)
    if not values:
        return "当前特征无方差"
    mean = 0.0
    for value in values:
        mean += value
    mean /= len(values)
    squared = 0.0
    for value in values:
        squared += (value - mean) ** 2
    # The variance is the *mean* squared deviation.  The previous code
    # returned sqrt(sum of squared deviations), which is neither the
    # variance nor the standard deviation.
    return squared / len(values)

def average(seq, total=0.0):
    """Return ``total`` plus the sum of ``seq``, divided by the item count.

    ``total`` is the starting value of the running sum.  An empty ``seq``
    leads to a division by zero.
    """
    count = 0
    for count, element in enumerate(seq, 1):
        total += element
    return total / count

def getQUANTILE(inList, inlocaltion):
    """Return a quantile of the numeric entries of ``inList``.

    Entries that cannot be parsed as a finite number (column headers, empty
    strings, ``"NaN"`` and similar) are ignored.

    Args:
        inList: Iterable of cell values, typically the strings of one CSV
            column.
        inlocaltion: Requested quantile position; must satisfy
            ``0 <= inlocaltion < 1``.

    Returns:
        * the message string "输入的分位数数值错误" when ``inlocaltion`` is out
          of range;
        * the message string "当前特征不可求中位数" when no entry is numeric;
        * for ``inlocaltion == 0.5`` the median (mean of the two middle
          values when the count is even);
        * otherwise the sorted value at index ``int(count * inlocaltion)``.
    """
    if inlocaltion < 0 or inlocaltion >= 1:
        return "输入的分位数数值错误"
    values = []
    for item in inList:
        # float() validates the whole cell; the previous prefix-only regex
        # crashed on "12abc" and dropped negative values, which shifted
        # every quantile.
        try:
            value = float(item)
        except (TypeError, ValueError):
            continue
        if math.isnan(value) or math.isinf(value):
            continue
        values.append(value)
    if not values:
        return "当前特征不可求中位数"
    values.sort()
    count = len(values)
    if inlocaltion == 0.5:
        middle = count // 2
        if count % 2 == 1:
            return values[middle]
        return (values[middle - 1] + values[middle]) / 2.0
    return values[int(count * inlocaltion)]

def fileREAD(fileURL, access):
    """Read a CSV file and return its contents as a two-dimensional list.

    NOTE(review): this repeats the ``fileREAD`` definition that appears
    earlier in this file; being the later one, it is the definition bound
    to the name once the module has loaded.

    Args:
        fileURL: Path of the CSV file to read.
        access: Mode string handed to ``open`` (for example ``"r"``).

    Returns:
        A list with one entry per CSV record; each entry is the list of
        fields produced by ``csv.reader`` for that record.
    """
    # ``open`` replaces the Python-2-only ``file`` builtin, and the ``with``
    # block closes the handle even when parsing raises part-way through.
    with open(fileURL, access) as csvfile:
        return list(csv.reader(csvfile))

def removeNoiseAuto(inList):
    """Blank out outliers of a column using the interquartile-range rule.

    A value is treated as noise when it lies more than 1.5 * IQR above the
    0.75 quantile or below the 0.25 quantile, where both quantiles come
    from ``getQUANTILE``.  Noise cells are replaced by ``''``.  Index 0 is
    never modified (the loop starts at 1, so a header cell is kept).

    Args:
        inList: Column values; modified in place.

    Returns:
        The same ``inList`` object.
    """
    Q3 = getQUANTILE(inList, 0.75)
    Q1 = getQUANTILE(inList, 0.25)
    # getQUANTILE reports "no numeric data" with a message string; there is
    # nothing to filter in that case, and subtracting strings would raise.
    if not isinstance(Q1, float) or not isinstance(Q3, float):
        return inList
    iqr = Q3 - Q1
    for i in range(1, len(inList)):
        # Cells that are already blank or otherwise non-numeric are left
        # alone instead of crashing float().
        try:
            value = float(inList[i])
        except (TypeError, ValueError):
            continue
        if value - Q3 > 1.5 * iqr or Q1 - value > 1.5 * iqr:
            inList[i] = ''
    return inList

def removeNoiseByThresholdMin(inList, inThresholdMin):
    """Blank out every value that is below a minimum threshold.

    Cells from index 1 onward whose numeric value is smaller than
    ``inThresholdMin`` are replaced by ``''``.  Index 0 is never modified
    (the loop starts at 1, so a header cell is kept).

    Args:
        inList: Column values; modified in place.
        inThresholdMin: Smallest value that is still kept.

    Returns:
        The same ``inList`` object.
    """
    for i in range(1, len(inList)):
        # Blank or non-numeric cells are skipped; float('') used to raise.
        try:
            value = float(inList[i])
        except (TypeError, ValueError):
            continue
        if value < inThresholdMin:
            inList[i] = ''
    return inList

def removeNoiseByThresholdMax(inList, inThresholdMax):
    """Blank out every value that is above a maximum threshold.

    Cells from index 1 onward whose numeric value is larger than
    ``inThresholdMax`` are replaced by ``''``.  Index 0 is never modified
    (the loop starts at 1, so a header cell is kept).

    Args:
        inList: Column values; modified in place.
        inThresholdMax: Largest value that is still kept.

    Returns:
        The same ``inList`` object.
    """
    for i in range(1, len(inList)):
        # Blank or non-numeric cells are skipped; float('') used to raise.
        try:
            value = float(inList[i])
        except (TypeError, ValueError):
            continue
        if value > inThresholdMax:
            inList[i] = ''
    return inList

def autoPaddingByAVG(inList):
    """Fill missing cells of a column with the column mean.

    The mean comes from ``getAVG``.  Every cell from index 1 onward that is
    exactly ``''`` is replaced by ``str(mean)``.  Index 0 is never modified
    (the loop starts at 1, so a header cell is kept).

    Args:
        inList: Column values; modified in place.

    Returns:
        The same ``inList`` object.
    """
    avg = getAVG(inList)
    # getAVG answers with a message string when the column holds no numeric
    # value; writing that message into the data cells would corrupt them.
    if not isinstance(avg, (int, float)):
        return inList
    filler = str(avg)
    for i in range(1, len(inList)):
        if inList[i] == '':
            inList[i] = filler
    return inList

def autoPaddingByMedian(inList):
    """Fill missing cells of a column with the column median.

    The median comes from ``getQUANTILE(inList, 0.5)``.  Every cell from
    index 1 onward that is exactly ``''`` is replaced by ``str(median)``.
    Index 0 is never modified (the loop starts at 1, so a header cell is
    kept).

    Args:
        inList: Column values; modified in place.

    Returns:
        The same ``inList`` object.
    """
    median = getQUANTILE(inList, 0.5)
    # getQUANTILE answers with a message string when the column holds no
    # numeric value; writing that message into the data cells would
    # corrupt them.
    if not isinstance(median, (int, float)):
        return inList
    filler = str(median)
    for i in range(1, len(inList)):
        if inList[i] == '':
            inList[i] = filler
    return inList

def binningWidth(inList, width):
    """Discretise a column by width-limited bins, smoothing by bin means.

    The numeric cells from index 1 onward are sorted.  A bin starts at the
    smallest value not yet assigned and takes every following value whose
    distance to that starting value is at most ``width``.  Each cell is
    then replaced by the mean (a float) of the bin it fell into.  Index 0
    and cells that are not finite numbers are left untouched.

    Args:
        inList: Column values; modified in place.
        width: Maximum distance between a bin's first value and any other
            value of the same bin.

    Returns:
        The same ``inList`` object.
    """
    # Keep the original position next to each value so results can be
    # written back by index; blank cells used to crash float().
    indexed = []
    for position in range(1, len(inList)):
        try:
            value = float(inList[position])
        except (TypeError, ValueError):
            continue
        if math.isnan(value) or math.isinf(value):
            continue
        indexed.append((value, position))
    indexed.sort()

    start = 0  # index in ``indexed`` of the first member of the open bin
    for stop in range(1, len(indexed) + 1):
        # Close the open bin at the end of the data, or as soon as the next
        # value is farther than ``width`` from the bin's first value.
        if stop == len(indexed) or indexed[stop][0] - indexed[start][0] > width:
            members = indexed[start:stop]
            mean = sum(value for value, _ in members) / len(members)
            for _, position in members:
                inList[position] = mean
            start = stop
    return inList

def binningDeep(inList, deep1):
    """Discretise a column by equal-frequency bins, smoothing by bin means.

    The numeric cells from index 1 onward are sorted and cut into
    consecutive bins of ``deep1 - 1`` values each (the last bin may be
    shorter).  Each cell is then replaced by the mean (a float) of its bin.
    Index 0 and cells that are not finite numbers are left untouched.

    NOTE(review): the bin size is ``deep1 - 1`` rather than ``deep1``; this
    is kept from the previous implementation so existing callers get the
    same bin boundaries -- confirm whether the offset is intended.

    Args:
        inList: Column values; modified in place.
        deep1: Bin depth parameter; must be at least 2.

    Returns:
        The same ``inList`` object.

    Raises:
        ValueError: If ``deep1`` is smaller than 2.
    """
    deep = deep1 - 1
    if deep < 1:
        raise ValueError("deep1 must be at least 2")

    # Keep the original position next to each value so results can be
    # written back by index; blank cells used to crash float().
    indexed = []
    for position in range(1, len(inList)):
        try:
            value = float(inList[position])
        except (TypeError, ValueError):
            continue
        if math.isnan(value) or math.isinf(value):
            continue
        indexed.append((value, position))
    indexed.sort()

    # Plain slicing covers every bin, including a final bin that is exactly
    # full.  The previous index arithmetic left that bin unsmoothed and
    # relied on Python 2 integer division.
    for start in range(0, len(indexed), deep):
        members = indexed[start:start + deep]
        mean = sum(value for value, _ in members) / len(members)
        for _, position in members:
            inList[position] = mean
    return inList

def oneHot(inList, Row):
    """One-hot encode column ``Row`` of the data matrix ``inList`` in place.

    The first row of ``inList`` is treated as the header.  For every
    distinct value found in the column below the header, a new column is
    appended with ``addRow``: its header cell is the value itself and each
    data cell is ``'1'`` where the row holds that value and ``'0'``
    otherwise.  New columns are added in order of first appearance.  Every
    row of the resulting matrix is then printed.

    Args:
        inList: Two-dimensional list of rows; modified in place.
        Row: Index of the column to encode.

    Returns:
        None.
    """
    column = getRow(inList, Row)
    cells = column[1:]  # drop the header cell

    # Collect the distinct categories in first-seen order so the layout of
    # the new columns is deterministic (it used to follow dict ordering and
    # relied on the Python-2-only dict.has_key).
    categories = []
    seen = set()
    for cell in cells:
        if cell not in seen:
            seen.add(cell)
            categories.append(cell)

    for category in categories:
        newColumn = [category]
        for cell in cells:
            newColumn.append('1' if cell == category else '0')
        addRow(inList, newColumn)

    for row in inList:
        # Parenthesised single-argument form prints the same on Python 2
        # and Python 3.
        print(row)

def minMax(inList):
    """Rescale the numeric cells of a column to the range [0, 1].

    Every numeric cell from index 1 onward is replaced by
    ``(value - min) / (max - min)`` formatted with four decimals (a string
    such as ``"0.5000"``).  Index 0 and cells that are not finite numbers
    are left untouched.  When all numeric cells are equal they all become
    ``"0.0000"``.

    Args:
        inList: Column values; modified in place.

    Returns:
        The same ``inList`` object.
    """
    numeric = []  # (position, value) for every parseable cell
    for position in range(1, len(inList)):
        # float() validates the whole cell; the previous prefix-only regex
        # crashed on "12abc" and skipped negative values.
        try:
            value = float(inList[position])
        except (TypeError, ValueError):
            continue
        if math.isnan(value) or math.isinf(value):
            continue
        numeric.append((position, value))
    if not numeric:
        # Nothing to scale; max()/min() of an empty list used to raise.
        return inList

    lowest = min(value for _, value in numeric)
    highest = max(value for _, value in numeric)
    span = highest - lowest
    for position, value in numeric:
        # A constant column has zero span; map it to 0 instead of dividing
        # by zero.
        scaled = (value - lowest) / span if span else 0.0
        inList[position] = "%.4f" % scaled
    return inList

def zScore(inList):
    """Standardise the numeric cells of a column (z-score normalisation).

    Every numeric cell from index 1 onward is replaced by
    ``(value - mean) / std`` formatted with four decimals (a string such as
    ``"-1.2247"``), where ``mean`` and ``std`` are the mean and the
    population standard deviation of those same cells.  Index 0 and cells
    that are not finite numbers are left untouched.  When all numeric cells
    are equal they all become ``"0.0000"``.

    Args:
        inList: Column values; modified in place.

    Returns:
        The same ``inList`` object.
    """
    numeric = []  # (position, value) for every parseable cell
    for position in range(1, len(inList)):
        try:
            value = float(inList[position])
        except (TypeError, ValueError):
            continue
        if math.isnan(value) or math.isinf(value):
            continue
        numeric.append((position, value))
    if not numeric:
        return inList

    count = len(numeric)
    mean = sum(value for _, value in numeric) / count
    # The statistics are computed here so the divisor really is the
    # standard deviation: sqrt(mean squared deviation).  The previous code
    # took the square root of a helper result that was already a square
    # root and had never been divided by the sample count.
    variance = sum((value - mean) ** 2 for _, value in numeric) / count
    std = math.sqrt(variance)
    for position, value in numeric:
        # Zero spread means every value equals the mean; report 0 instead
        # of dividing by zero.
        score = (value - mean) / std if std else 0.0
        inList[position] = "%.4f" % score
    return inList

def similarityDistance(inList1, inList2, n):
    """Return the Minkowski distance of order ``n`` between two columns.

    Index 0 of both lists is skipped (the loop starts at 1, so a header
    cell is ignored); the remaining cells are converted with ``float``.
    ``n == 1`` gives the Manhattan distance and ``n == 2`` the Euclidean
    distance.

    Args:
        inList1: First column; its length decides how many cells are read.
        inList2: Second column; must be at least as long as ``inList1``.
        n: Order of the distance; must be positive.

    Returns:
        ``(sum(|a - b| ** n)) ** (1 / n)`` as a float.

    Raises:
        ValueError: If ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError("n must be positive")
    total = 0.0
    for i in range(1, len(inList1)):
        total += abs(float(inList1[i]) - float(inList2[i])) ** n
    # The root has to match the exponent; a fixed square root was only
    # correct for n == 2.
    return total ** (1.0 / n)

def similaritySim(inList1, inList2):
    """Return the cosine similarity between two columns.

    Index 0 of both lists is skipped (the loops start at 1, so a header
    cell is ignored); the remaining cells are converted with ``float``.

    Args:
        inList1: First column; its length decides how many cell pairs
            enter the dot product.
        inList2: Second column; must be at least as long as ``inList1``.

    Returns:
        ``dot(a, b) / (|a| * |b|)`` as a float, or ``0.0`` when either
        vector has zero length (the ratio is undefined there and used to
        raise ``ZeroDivisionError``).
    """
    dot = 0.0
    for i in range(1, len(inList1)):
        dot += float(inList1[i]) * float(inList2[i])
    squares1 = 0.0
    for i in range(1, len(inList1)):
        squares1 += float(inList1[i]) ** 2
    squares2 = 0.0
    for i in range(1, len(inList2)):
        squares2 += float(inList2[i]) ** 2

    denominator = math.sqrt(squares1) * math.sqrt(squares2)
    if denominator == 0:
        return 0.0
    return dot / denominator

# Load the training CSV into a two-dimensional list as soon as the module
# runs.  The path is hard-coded to a local Windows directory.  Everything
# below is commented-out sample usage of the helper functions.
fileInput = fileREAD("D:\\PythonWorkSpace\\ExternalFile\\train.csv","r")

# # Get one row of data
# print getLine(fileInput,1)
#
# # Get one column of data
# print getRow(fileInput,0)

# # Set one row of data
# print "设置前:"
# print getLine(fileInput,1)
# setLine(fileInput,getLine(fileInput,2),1)
# print "设置后:"
# print getLine(fileInput,1)

# # Set one column of data
# print "设置前:"
# print getRow(fileInput,1)
# setRow(fileInput,getRow(fileInput,2),1)
# print "设置后:"
# print getRow(fileInput,1)

# # Mean
# print getAVG(getRow(fileInput,9))

# # Variance
# print getAVE(getRow(fileInput,9))

# # Quantile
# print getQUANTILE(getRow(fileInput,9),0.5)

# # Noise filtering 1
# print removeNoiseAuto(getRow(fileInput,1))
#
# # Noise filtering 2
# print removeNoiseByThresholdMin(getRow(fileInput,0),10)
#
# # Noise filtering 3
# print removeNoiseByThresholdMax(getRow(fileInput,0),10)

# # Missing-value filling 1
# print autoPaddingByAVG(getRow(fileInput,0))
#
# # Missing-value filling 2
# print autoPaddingByMedian(getRow(fileInput,0))

# # Equal-width binning
# print binningWidth(getRow(fileInput,0),3)
#
# # Equal-frequency binning
# print binningDeep(getRow(fileInput,0),3)

# # One-hot encoding
# oneHot(fileInput,1)
# for i in fileInput:
#     print i

# # Min-max normalization
# print minMax(getRow(fileInput,0))
#
# # z-score normalization
# print zScore(getRow(fileInput,0))

# # Distance similarity
# print similarityDistance(getRow(fileInput,0),getRow(fileInput,0),2)

# # Cosine similarity
# print similaritySim(getRow(fileInput,0),getRow(fileInput,1))
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  python 数据