您的位置:首页 > 编程语言 > Python开发

python实现指定目录下批量文件的单词计数:串行版本

2014-09-30 23:50 871 查看
直接上代码。

练习目标:

1. 使用 Python 面向对象的方法封装逻辑和表达 ;

2. 使用异常处理和日志API ;

3. 使用文件目录读写API ;

4. 使用 list、dict(即映射/map)、tuple 三种数据结构;

5. lambda 、正则使用及其它。

下一篇将实现并发版本。

#-------------------------------------------------------------------------------
# Name:        wordstat_serial.py
# Purpose:     count word occurrences in the Java files under a given directory (serial version)
#
# Author:      qin.shuq
#
# Created:     08/10/2014
# Copyright:   (c) qin.shuq 2014
# Licence:     <your licence>
#-------------------------------------------------------------------------------

import re
import os
import time
import logging

# Maps the level names used by this script to the numeric constants
# defined by the logging module.
LOG_LEVELS = {
    'DEBUG': logging.DEBUG,
    'INFO': logging.INFO,
    'WARN': logging.WARNING,
    'ERROR': logging.ERROR,
    'CRITICAL': logging.CRITICAL,
}


def initlog(filename):
    """Return a logger that appends INFO-and-above records to *filename*.

    Every distinct file name gets its own named logger.  The previous
    implementation configured the root logger on each call, so all
    loggers returned by this function were the same object and every
    record was written to every log file.

    filename -- path of the log file; it is opened (and created if
                missing) by logging.FileHandler.

    Returns the configured logging.Logger.  Calling the function again
    with the same file name returns the same logger without attaching a
    second handler.
    """
    logger = logging.getLogger(filename)

    # logging.getLogger caches loggers by name, so only attach the file
    # handler the first time this name is seen; otherwise each record
    # would be written once per call to initlog.
    if not logger.handlers:
        hdlr = logging.FileHandler(filename)
        hdlr.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
        logger.addHandler(hdlr)

    logger.setLevel(LOG_LEVELS['INFO'])
    return logger

# Module-level loggers used by WordReading: errlog records files that
# could not be read, infolog records files that were read successfully.
errlog, infolog = initlog("error.log"), initlog("info.log")

class WordReading(object):
    """Read a list of text files and collect all of their lines."""

    def __init__(self, fileList):
        """Store the paths to read.

        fileList -- iterable of file paths.
        """
        self.fileList = fileList

    def readFileInternal(self, filename):
        """Return the lines of *filename*, or an empty list on failure.

        A successful read is recorded in infolog.  An IOError (missing
        file, permission problem, ...) is recorded in errlog together
        with its reason and is not propagated, so one unreadable file
        does not abort the whole run.
        """
        try:
            # The with-block closes the file even if readlines() raises.
            with open(filename, 'r') as f:
                lines = f.readlines()
        except IOError as err:
            errlog.error('failed to read file %s: %s\n' % (filename, err))
            return []
        infolog.info('[successful read file %s]\n' % filename)
        return lines

    def readFile(self):
        """Return the lines of every file in fileList as one flat list.

        Lines keep the order of fileList and their line terminators.
        """
        allLines = []
        for filename in self.fileList:
            allLines.extend(self.readFileInternal(filename))
        return allLines

class WordAnalyzing(object):
    """Count how many times each word occurs in a list of text lines."""

    # A word is a maximal run of "\w" characters.  Raw string so the
    # backslash reaches the regex engine without relying on Python
    # passing unknown string escapes through.
    wordRegex = re.compile(r"\w+")

    def __init__(self, allLines):
        """Store the lines to analyze.

        allLines -- list of strings, e.g. the result of readlines().
        """
        self.allLines = allLines

    def analyze(self):
        """Return a dict mapping each word to its number of occurrences.

        Each entry of allLines is scanned on its own.  Concatenating
        the entries first would glue the last word of an entry that has
        no trailing newline (typically the last line of a file) to the
        first word of the next entry and count a word that never
        appeared in the input.
        """
        result = {}
        findWords = WordAnalyzing.wordRegex.findall
        for line in self.allLines:
            for word in findWords(line):
                result[word] = result.get(word, 0) + 1
        return result

class FileObtainer(object):
    """Collect the files below a directory, optionally filtered."""

    def __init__(self, dirpath, fileFilterFunc=None):
        """Store the search root and the optional filter.

        dirpath        -- directory that is walked recursively.
        fileFilterFunc -- optional callable taking a file path and
                          returning a true value for paths to keep;
                          None keeps every file.
        """
        self.dirpath = dirpath
        self.fileFilterFunc = fileFilterFunc

    def findAllFilesInDir(self):
        """Return a list with the paths of all matching files.

        The result is always a list.  The builtin filter() returns a
        lazy one-shot iterator on Python 3, which would make the return
        type depend on whether a filter was supplied.  Paths are built
        with os.path.join so they use the platform's separator.
        """
        keep = self.fileFilterFunc
        files = []
        for path, dirs, filenames in os.walk(self.dirpath):
            for filename in filenames:
                fullpath = os.path.join(path, filename)
                if keep is None or keep(fullpath):
                    files.append(fullpath)
        return files

class PostProcessing(object):
    """Rank and print the result of a word count."""

    def __init__(self, resultMap):
        """Store the word count.

        resultMap -- dict mapping word -> number of occurrences.
        """
        self.resultMap = resultMap

    def sortByValue(self):
        """Return (word, count) tuples sorted by count, highest first."""
        return sorted(self.resultMap.items(), key=lambda e: e[1],
                      reverse=True)

    def obtainTopN(self, topN):
        """Print the topN most frequent words, one per line.

        If fewer than topN words exist, all of them are printed; a
        topN of zero or less prints nothing.
        """
        # max() keeps a negative topN from being read as "all but the
        # last N" by the slice.
        for word, count in self.sortByValue()[:max(topN, 0)]:
            # A single pre-formatted string prints identically under
            # the Python 2 print statement and the Python 3 function.
            print('%s  counts:  %s' % (word, count))

if __name__ == "__main__":
    # Script entry point: find the .java files, read them, count the
    # words and print the 30 most frequent ones, timing each phase.
    import sys

    # Directory to scan: the first command-line argument if one is
    # given, otherwise the original hard-coded location.
    defaultDir = "c:\\Users\\qin.shuq\\Desktop\\region_master\\src"
    dirpath = sys.argv[1] if len(sys.argv) > 1 else defaultDir

    # Phase 1: collect the paths of all .java files below dirpath.
    starttime = time.time()
    fileObtainer = FileObtainer(dirpath, lambda f: f.endswith('.java'))
    fileList = fileObtainer.findAllFilesInDir()
    endtime = time.time()
    print('ObtainFile cost:  %s ms' % ((endtime - starttime) * 1000))

    # Phase 2: read every file into one flat list of lines.
    starttime = time.time()
    wr = WordReading(fileList)
    allLines = wr.readFile()
    endtime = time.time()
    print('WordReading cost:  %s ms' % ((endtime - starttime) * 1000))

    # Phase 3: count the occurrences of each word.
    starttime = time.time()
    wa = WordAnalyzing(allLines)
    resultMap = wa.analyze()
    endtime = time.time()
    print('WordAnalyzing cost:  %s ms' % ((endtime - starttime) * 1000))

    # Phase 4: rank the words and print the 30 most frequent.
    starttime = time.time()
    postproc = PostProcessing(resultMap)
    postproc.obtainTopN(30)
    endtime = time.time()
    print('PostProcessing cost:  %s ms' % ((endtime - starttime) * 1000))



                                            
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: