您的位置：首页 > 编程语言 > Python开发

python(2):使用python分析大日志文件思路及过程

2017-05-10 10:32 519 查看

1.做服务器开发的经常会遇到要分析大量的日志，统计大量数据；这里介绍几种统计日志数据的方法和思路

之前有遇到过要统计几天内的url出现次数的事情，一天有24个gz压缩文件，每个文件大概6G左右，URL的不重复率也很高

使用方法:

1.用shell 解压然后再统计，shell脚本写起来麻烦，统计那一块很多人也不是很熟悉(我也不咋熟悉)，sort又很慢，用下面的方法进行md5转换就更慢了

# Hash every line of ./${filename}_txt with md5sum and append one digest per
# line to ./1/${filename}_txt.
# IFS= and -r keep leading/trailing blanks and backslashes of each line intact,
# and the quoting stops word splitting/globbing from altering the hashed text.
while IFS= read -r line
do
    # md5sum prints "<digest>  -"; awk keeps only the digest column.
    # printf keeps the trailing newline that the original echo also hashed.
    m5=$(printf '%s\n' "$line" | md5sum | awk '{print $1}')
    echo "$m5" >> "./1/${filename}_txt"
done < "./${filename}_txt"

2.使用lua(这个因为是本人最熟悉的脚本，所以先考虑的这个)，在分析小数据的时候还是挺快的，数据量大了之后可能会产生内存分配失败的异常，32位下最大是2G，64位下理论上是可以达到2的64次方的，但是只要没有多余的内存可供分配就会抛出内存异常；而且在windows和linux下都需要独立安装，公司的有些服务器不提供安装许可，所以就不能用了

3.第三种使用python进行分片处理，python的问题和上面的lua一样，32位下允许2G，64位下允许2的64次方；linux系统基本都自带python；但是即使是64位下，加载的数据大了之后也会出现各种问题；

问题: 1.空闲可用内存用完后python就会卡着不动，或者抛出MemoryError异常，这个可以用try-except处理，也可以分段处理

2. python在读取文件的时候readline如果遇到无法识别的结束符eof的时候会报错(在gzip的部分版本是会出现的)，可以使用try-except处理或者更新gzip的库

4.第四种思路也是用python，就是将数据存入数据库进行处理，可以存入python自带的sqlite数据库(试过解析插入会比较慢)；也可以存入redis这样的存储数据库，会比较快

下面提供第三种的切片代码:

切片的思路有两种:

第一种是横向切片:就是按照常规的逻辑，每个文件的url进行合并统计，然后再进行一层一层的文件合并；这个的问题是如果url的重复率很低的话，后面的合并文件会越来越大，最终都会出现MemoryError的情况，好处是当URL重复率较高的话扫描一遍文件就可能合并完毕。

第二种是纵向切片:就是按照数据进行分片，取url转换成md5之后的一位十六进制值(代码中取的是倒数第二位)进行取模，然后按照模的大小一遍一遍地扫描所有文件进行统计，可以根据分片的数据量调整模的大小，就不会出现内存不够的情况了，不足之处是要扫描好多次。

# -*- coding: utf-8 -*-

import os, sys

import hashlib

import gzip

import threading

from array import *

import gc

# Module-level MD5 object. NOTE(review): GetZipFile assigns its own local `md`
# for every line, so nothing visible in this file appears to read this one.
md = hashlib.md5()

# do file

class DoFile:
    """Count URL occurrences from gzip logs, keyed by the MD5 of the URL.

    Counts are accumulated in ``self.LineList`` (digest -> count). When the
    dictionary grows too large it is spilled to a "digest count" text file by
    ``putOut``; the spill files are later re-read shard by shard with
    ``GetModeInf`` so only one shard has to fit in memory at a time.
    """

    def __init__(self):
        # digest -> accumulated count for the data currently held in memory
        self.LineList = {}
        # number of distinct keys added since the last flush/reset
        self.max = 0
        # distinct-key counters per bucket; GetZipFile only writes slots 0..9
        self.all_num_array = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        # sequence number of the next spill file written by MarkOutPutFile
        self.out_file_num = 1
        # running statistics maintained by GetModeInf, reset by getTongjiAll
        self.all_max = 0        # sum of all counts seen in the current shard
        self.file_num = 0       # number of distinct keys in the current shard
        self.one_max = 0        # sum of counts of keys that occur more than once
        self.one_file_num = 0   # number of keys that occur more than once

    def putOut(self, file_name):
        """Write every "digest count" pair to *file_name* and reset the buffer.

        Each record is written as ``key + " " + count + " \\n"`` (note the
        trailing space), then the dictionary and ``self.max`` are cleared.
        """
        print(file_name)
        # The context manager closes the file even if a write fails.
        with open(file_name, "w") as out_f:
            for key, value in self.LineList.items():
                out_f.write(key + " " + str(value) + " \n")
        self.max = 0
        self.LineList.clear()
        gc.collect()

    def getTongJi(self, fen_duan):
        """Scan the in-memory dictionary, print its statistics, then reset it.

        Prints *fen_duan*, the sum of all counts, the number of keys, the sum
        of counts for keys seen more than once, and the number of such keys.
        This walks the whole dictionary and is therefore comparatively slow.
        """
        all_sum = 0
        file_num = 0
        max_1 = 0
        max_1_file_num = 0
        for value in self.LineList.values():
            all_sum += value
            file_num += 1
            if value > 1:
                max_1 += value
                max_1_file_num += 1
        print(fen_duan, all_sum, file_num, max_1, max_1_file_num)
        self.LineList.clear()
        self.max = 0
        gc.collect()

    def getTongjiAll(self, num):
        """Print the running statistics kept by GetModeInf and reset all state.

        *num* is printed as a label in front of the four counters.
        """
        print(num, self.all_max, self.file_num, self.one_max, self.one_file_num)
        self.all_max = 0
        self.file_num = 0
        self.one_max = 0
        self.one_file_num = 0
        self.LineList.clear()
        self.max = 0
        gc.collect()

    def GetTowInfo(self, file_name):
        """Merge a "digest count" text file into the in-memory dictionary.

        Returns ``self.max``, which is increased by one per record read.
        Records are split on any whitespace so that files produced by
        ``putOut`` (which end every record with a space) can be read back;
        blank lines are skipped. A non-blank line that does not hold exactly
        two fields still raises ValueError.
        """
        with open(file_name, "r") as f_handl:
            for line in f_handl:
                fields = line.split()
                if not fields:
                    continue
                key, value = fields
                self.LineList[key] = self.LineList.get(key, 0) + int(value)
                self.max += 1
        return self.max

    def GetModeInf(self, file_name, mode, mode_to):
        """Accumulate the records of one shard from a "digest count" file.

        A record belongs to the shard when the second-to-last hex digit of
        its key, taken modulo *mode*, equals *mode_to*. Matching records are
        merged into ``self.LineList`` and the running counters ``all_max``,
        ``file_num``, ``one_max`` and ``one_file_num`` are updated.

        Returns ``self.max``, which this method does not modify.
        """
        with open(file_name, "r") as f_handl:
            # Iterate lazily; the spill files can be very large.
            for line in f_handl:
                fields = line.split()
                if not fields:
                    continue
                key, value = fields
                if int(key[-2], 16) % mode != mode_to:
                    continue
                count = int(value)
                previous = self.LineList.get(key)
                if previous is None:
                    self.LineList[key] = count
                    self.file_num += 1
                    if count > 1:
                        self.one_max += count
                        self.one_file_num += 1
                else:
                    if previous == 1:
                        # The key just became a repeated one: add the earlier
                        # single hit that had not been counted as repeated.
                        self.one_max += count + 1
                        self.one_file_num += 1
                    else:
                        self.one_max += count
                    self.LineList[key] = previous + count
                self.all_max += count
        return self.max

    def SetOutFileNum(self, out_num):
        """Set the sequence number used for the next spill file."""
        self.out_file_num = out_num

    def MarkOutPutFile(self, max_num):
        """Spill the dictionary to ./1/uzip_txt_<n> when *max_num* > 10000000."""
        if max_num > 10000000:
            out_file = "./1/uzip_txt_" + str(self.out_file_num)
            self.out_file_num += 1
            self.putOut(out_file)

    def GetZipFile(self, file_name):
        """Read a gzip log and count the MD5 digest of field 7 of every line.

        Lines are split on single spaces and the field at index 6 is stripped
        and hashed. Lines with fewer than seven fields are skipped. The
        per-bucket counters in ``self.all_num_array`` and ``self.max`` are
        increased once per new key.

        Returns ``self.max``, or 30000000 if reading the archive fails; that
        value is above the MarkOutPutFile threshold, so a caller passing it
        on forces a flush.
        """
        print("zip", file_name)
        f = gzip.open(file_name, 'rb')
        try:
            while True:
                try:
                    t_line = f.readline()
                except Exception as err:
                    # Deliberately broad: some gzip versions fail on an
                    # unexpected end-of-file marker; keep what was read so far.
                    print(Exception, ":", err)
                    return 30000000
                if not t_line:
                    break
                # The file is opened in binary mode, so split on a bytes
                # separator (b' ' is a plain str on Python 2).
                t_value = t_line.split(b' ')
                if len(t_value) <= 6:
                    continue
                line = t_value[6].strip()
                key = hashlib.md5(line).hexdigest()
                # Second-to-last hex digit (0..15) folded into buckets 0..9.
                mark = int(key[-2], 16) % 10
                if key not in self.LineList:
                    self.LineList[key] = 1
                    self.all_num_array[mark] += 1
                    self.max += 1
                else:
                    self.LineList[key] += 1
        finally:
            # Runs on the early return above as well.
            f.close()
        print("array_info:", self.all_num_array)
        return self.max

    def GetCountNum(self):
        """Return how many shards the counting pass should use.

        Adds one shard per full 20000000 keys in each of the buckets 0..9.
        Floor division keeps the result an integer on Python 3, and the
        result is never below 1 so that a small data set is still counted
        in a single pass instead of being skipped.
        """
        max_sum_all = 0
        max_one = 0
        for i in range(0, 10):
            max_sum_all += self.all_num_array[i]
            max_one += self.all_num_array[i] // 20000000
        max_one = max(max_one, 1)
        print("splite", max_sum_all, max_one)
        return max_one

    def GetCountFileNum(self):
        """Return the sequence number the next spill file would get."""
        return self.out_file_num

# Single shared DoFile instance used by UnZgrepFile, CountFile and the
# __main__ block below.
dofile=DoFile()

#unzgrep file

def UnZgrepFile(out_file_num, zipfile_s, zipfile_e):
    """Parse the archives image_<NN>.gz and spill their counts to disk.

    out_file_num -- sequence number for the first spill file
    zipfile_s    -- first archive index (inclusive)
    zipfile_e    -- last archive index (exclusive)

    Indices below 10 are zero-padded to two digits in the file name. After
    each archive the shared ``dofile`` decides whether to flush; whatever is
    still in memory after the last archive is flushed unconditionally,
    otherwise those keys would never reach a spill file.
    """
    dofile.SetOutFileNum(out_file_num)
    for i in range(zipfile_s, zipfile_e):
        if i < 10:
            str_k = "image_0" + str(i) + ".gz"
        else:
            str_k = "image_" + str(i) + ".gz"
        max_num = dofile.GetZipFile(str_k)
        dofile.MarkOutPutFile(max_num)
    # Final flush: MarkOutPutFile only writes above its size threshold.
    if dofile.LineList:
        next_num = dofile.GetCountFileNum()
        dofile.SetOutFileNum(next_num + 1)
        dofile.putOut("./1/uzip_txt_" + str(next_num))

#count

def CountFile(c_s, c_max, file_num):
    """Run the counting pass shard by shard over the spill files.

    c_s      -- first shard index (inclusive)
    c_max    -- number of shards; also used as the modulus for sharding
    file_num -- one past the last spill file number (files 1..file_num-1)

    For every shard all spill files ./1/uzip_txt_<k> are scanned through the
    shared ``dofile`` and the shard's statistics are printed afterwards.
    """
    for j in range(c_s, c_max):
        print("count", c_max, j)
        for k in range(1, file_num):
            file_k = "./1/uzip_txt_" + str(k)
            print(file_k)
            dofile.GetModeInf(file_k, c_max, j)
        # Prints and resets the counters before the next shard starts.
        dofile.getTongjiAll(j)

if __name__ == '__main__':
    # Step 1: parse image_00.gz .. image_23.gz and spill the counts to ./1/.
    UnZgrepFile(1, 0, 24)

    # Step 2: derive the number of shards from the per-bucket key counters.
    split_num = dofile.GetCountNum()
    out_file_num = dofile.GetCountFileNum()

    # Step 3: count every shard across all spill files.
    CountFile(0, split_num, out_file_num)

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： python 日志分析 MemoryErr

相关文章推荐

新的分享

章节导航