
Nginx Notes: Log Analysis (shell + python)

2017-03-24 13:36
Related tools and articles:
- GoAccess: a solid nginx log analysis tool
- ngxtop: real-time nginx log analysis
- Apache/Nginx access-log analysis scripts (雷纳科斯's blog, 2013)

Using awk to analyze API response times in the nginx log

log_format main '$remote_addr - $remote_user [$time_iso8601] "$request" '
                '$status $body_bytes_sent "$http_referer" '
                '"$http_user_agent" "$http_x_forwarded_for" '
                '"$upstream_addr" "$upstream_status" "$request_time"';

Extract the quoted $request_time into its own column, paste it back onto the log, and keep the requests that took longer than one second:

cat website.access.log | awk '{print $(NF)}' | awk -F "\"" '{print $2}' > a.txt
paste -d " " website.access.log a.txt > b.txt
cat b.txt | awk '($NF>1){print $6$7 " " $NF}' > c.txt

Analyzing and aggregating the nginx log on Linux with awk, wc, sort, uniq and grep

Field meanings (data.log is tab-separated):
column1: ip_address
column2: log_time
column3: request
column4: status_code
column5: send_bytes
column6: referer

Requirement 1: count the total number of records, the total successes, and each kind of failure (404, 403, 500)

# 1. total record count
cat data.log | awk -F '\t' '{if($4 > 0) print $4}' | wc -l | awk '{print "Total Items:"$1}'
# 2. totals for successes and for each failure code (replace 200 with 404/403/500)
cat data.log | awk -F '\t' '{if($4>0 && $4==200) print $4}' | wc -l

Requirement 2: among the errors, find which URLs occur most often, deduplicated and sorted in descending order

cat data.log | awk -F '\t' '{if($4>0 && $4==500) print $3}' | awk '{print $2}' | sort | uniq -c | sort -k1 -rn

Requirement 3: count how often each file name appears in the URL, keeping the status code and referer in the result. Both the URL and the referer contain the / character, which interferes with filtering; using # as sed's delimiter and a greedy match up to the last / solves it:

cat data.log | awk '{print $5,$7,$9}' | grep 200 | sed 's#.*/\(.*\)#\1#' | sort -k1 | uniq -c

Common one-liners against access.log:

wc -l access.log | awk '{print $1}'                                                  # total requests
awk '{print $1}' access.log | sort | uniq | wc -l                                    # unique IPs
awk -F'[ []' '{print $5}' access.log | sort | uniq -c | sort -rn | head -5           # client requests per second, top 5
awk '{print $1}' access.log | sort | uniq -c | sort -rn | head -5                    # most frequent client IPs, top 5
awk '{print $7}' access.log | sort | uniq -c | sort -rn | head -5                    # most requested URLs, top 5
awk '{if ($12 > 10){print $7}}' access.log | sort | uniq -c | sort -rn | head -5     # URLs with response time over 10 s, top 5
awk '{if ($13 != 200){print $13}}' access.log | sort | uniq -c | sort -rn | head -5  # non-200 status codes, top 5

Analyzing the behavior of source IPs that made more than 50000 requests:

awk '{print $1}' access.log | sort | uniq -c | sort -rn | awk '{if ($1 > 50000){print $2}}' > tmp.txt
for i in $(cat tmp.txt)
do
    echo $i >> analysis.txt
    echo "access behavior statistics" >> analysis.txt
    grep $i access.log | awk '{print $6}' | sort | uniq -c | sort -rn | head -5 >> analysis.txt
    echo "accessed API statistics" >> analysis.txt
    grep $i access.log | awk '{print $7}' | sort | uniq -c | sort -rn | head -5 >> analysis.txt
    echo -e "\n" >> /root/analysis/$Ydate.txt
done

If the source IPs come through a proxy, change the first command to filter on the $http_x_forwarded_for address instead:

awk '{print $NF}' access.log | sort | uniq -c | sort -rn | awk '{if ($1 > 50000){print $2}}' > tmp.txt

Performance metrics
- Concurrent connections: a client sends a request and establishes a TCP connection; the total number of TCP connections the server holds per second is the concurrency.
- PV (page views), UV (unique visitors), and unique IPs.
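When these pipelines get unwieldy, the same counts take only a few lines of Python. A minimal sketch, assuming the default combined log format and a local access.log path (both placeholders) and Python 2.7+ for collections.Counter:

#!/usr/bin/python
# Sketch: total requests, unique IPs, and top-5 lists from a combined log.
# "access.log" and the field positions are assumptions; adjust to your format.
from collections import Counter

ips = Counter()
urls = Counter()
total = 0
with open("access.log") as f:
    for line in f:
        fields = line.split()
        if len(fields) < 7:
            continue
        total += 1
        ips[fields[0]] += 1      # client IP, awk's $1
        urls[fields[6]] += 1     # request URI, awk's $7
print "total requests:", total
print "unique IPs:", len(ips)
print "top 5 IPs:", ips.most_common(5)
print "top 5 URLs:", urls.most_common(5)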
Case 1
ip - - [23/Mar/2017:00:17:49 +0800] "GET / HTTP/1.1" 302 0 "-" "PycURL/7.19.7"

log_format access '$http_x_real_ip - $remote_user [$time_local] "$request" '
                  '$status $body_bytes_sent "$http_referer" '
                  '"$http_user_agent" "$http_x_forwarded_for"';

192.168.21.1 - - [27/Jan/2014:11:28:53 +0800] "GET /2.php HTTP/1.1" 200 133 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1707.0 Safari/537.36" "-" 192.168.21.128 200 127.0.0.1:9000 0.119 0.119

#log_format  main  '$remote_addr - $remote_user [$time_local] "$request" '
#                  '$status $body_bytes_sent "$http_referer" '
#                  '"$http_user_agent" "$http_x_forwarded_for"';

$http_host: the URL (IP or domain name) the user entered in the browser   192.168.21.128
$upstream_status: upstream status code   200
$upstream_addr: backend upstream address and port   127.0.0.1:9000
$request_time: total time taken to serve the request   0.119
$upstream_response_time: time the upstream took to respond   0.119
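These timing fields make slow requests easy to find programmatically. A minimal sketch, assuming lines laid out like the sample above (the two timing values as the last whitespace-separated fields, the request URI at awk's $7; the file name is a placeholder):

#!/usr/bin/python
# Sketch: average $request_time and the slowest requests, assuming the
# timing fields sit at the end of each line as in the sample above.
times = []
with open("access.log") as f:
    for line in f:
        fields = line.split()
        try:
            # $request_time is the second-to-last field in the sample above,
            # $upstream_response_time the last
            times.append((float(fields[-2]), fields[6]))
        except (IndexError, ValueError):
            continue
times.sort(reverse=True)
if times:
    print "average request_time:", sum(t for t, _ in times) / len(times)
    print "slowest 5 requests:", times[:5]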

awk field positions in the combined log format (default whitespace split):

$1  $remote_addr
$7  $request
$9  $status
$10 $body_bytes_sent
$11 $http_referer
$6  $http_user_agent (this one assumes splitting on double quotes, awk -F'"')
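Both conventions, demonstrated in Python on the sample line from above (user agent shortened for readability):

#!/usr/bin/python
# Sketch: the two field conventions used in this article, on one log line.
line = '192.168.21.1 - - [27/Jan/2014:11:28:53 +0800] "GET /2.php HTTP/1.1" 200 133 "-" "Mozilla/5.0" "-"'

fields = line.split()        # awk's default whitespace fields
print fields[0]              # awk $1  -> $remote_addr
print fields[8]              # awk $9  -> $status
print fields[9]              # awk $10 -> $body_bytes_sent

quoted = line.split('"')     # awk -F'"' fields (awk numbers them from 1)
print quoted[1]              # awk $2 -> $request
print quoted[5]              # awk $6 -> $http_user_agent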

The report script below collects:

1. Total visits
2. Total bandwidth
3. Unique visitors
4. Visits per IP
5. Visits per URL
6. Referer statistics
7. 404 statistics
8. Search-engine spider visits (Google, Baidu)
9. Search-engine referral statistics (Google, Baidu)

#!/bin/bash
log_path=/home/www.centos.bz/log/access.log.1
domain="centos.bz"
email="log@centos.bz"
maketime=`date +%Y-%m-%d" "%H":"%M`
logdate=`date -d "yesterday" +%Y-%m-%d`
total_visit=`wc -l ${log_path} | awk '{print $1}'`
total_bandwidth=`awk -v total=0 '{total+=$10}END{print total/1024/1024}' ${log_path}`
total_unique=`awk '{ip[$1]++}END{print asort(ip)}' ${log_path}`
ip_pv=`awk '{ip[$1]++}END{for (k in ip){print ip[k],k}}' ${log_path} | sort -rn | head -20`
url_num=`awk '{url[$7]++}END{for (k in url){print url[k],k}}' ${log_path} | sort -rn | head -20`
referer=`awk -v domain=$domain '$11 !~ /http:\/\/[^/]*'"$domain"'/{url[$11]++}END{for (k in url){print url[k],k}}' ${log_path} | sort -rn | head -20`
notfound=`awk '$9 == 404 {url[$7]++}END{for (k in url){print url[k],k}}' ${log_path} | sort -rn | head -20`
spider=`awk -F'"' '$6 ~ /Baiduspider/ {spider["baiduspider"]++} $6 ~ /Googlebot/ {spider["googlebot"]++}END{for (k in spider){print k,spider[k]}}' ${log_path}`
search=`awk -F'"' '$4 ~ /http:\/\/www\.baidu\.com/ {search["baidu_search"]++} $4 ~ /http:\/\/www\.google\.com/ {search["google_search"]++}END{for (k in search){print k,search[k]}}' ${log_path}`
echo -e "Overview\nReport generated: ${maketime}\nTotal visits: ${total_visit}\nTotal bandwidth: ${total_bandwidth}M\nUnique visitors: ${total_unique}\n\nVisits per IP\n${ip_pv}\n\nVisits per URL\n${url_num}\n\nReferer statistics\n${referer}\n\n404 statistics\n${notfound}\n\nSpider statistics\n${spider}\n\nSearch-engine referral statistics\n${search}" | mail -s "$domain $logdate log statistics" ${email}
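The script reads yesterday's rotated log (access.log.1) and stamps the report with date -d "yesterday", so it is meant to run once per day, after log rotation. Assuming rotation happens around midnight, a crontab entry along these lines would schedule it (the script path is hypothetical):

30 0 * * * /bin/bash /home/www.centos.bz/log_report.sh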
Case 2
# tar zxvf pymongo-1.11.tar.gz
# cd pymongo-1.11
# python setup.py install
Sample python script connecting to mongodb
$ cat conn_mongodb.py
#!/usr/bin/python

import pymongo
import random

conn = pymongo.Connection("127.0.0.1", 27017)
# select the database
db = conn.tage
# authenticate
db.authenticate("tage", "123")
# drop the user collection
db.user.drop()
# insert a single document
db.user.save({'id': 1, 'name': 'kaka', 'sex': 'male'})
# insert a batch of documents in a loop
for id in range(2, 10):
    name = random.choice(['steve', 'koby', 'owen', 'tody', 'rony'])
    sex = random.choice(['male', 'female'])
    db.user.insert({'id': id, 'name': name, 'sex': sex})
# print every document
content = db.user.find()
for i in content:
    print i

Writing the python script

#encoding=utf8

import re

zuidaima_nginx_log_path = "/usr/local/nginx/logs/www.zuidaima.com.access.log"
pattern = re.compile(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')

def stat_ip_views(log_path):
    # count views per client IP, keyed by the leading IP on each line
    ret = {}
    f = open(log_path, "r")
    for line in f:
        match = pattern.match(line)
        if match:
            ip = match.group(0)
            if ip in ret:
                views = ret[ip]
            else:
                views = 0
            views = views + 1
            ret[ip] = views
    return ret

def run():
    ip_views = stat_ip_views(zuidaima_nginx_log_path)
    max_ip_view = {}
    for ip in ip_views:
        views = ip_views[ip]
        if len(max_ip_view) == 0:
            max_ip_view[ip] = views
        else:
            _ip = max_ip_view.keys()[0]
            _views = max_ip_view[_ip]
            if views > _views:
                max_ip_view[ip] = views
                max_ip_view.pop(_ip)
        print "ip:", ip, ",views:", views
    # how many distinct IPs in total
    print "total:", len(ip_views)
    # the IP with the most views
    print "max_ip_view:", max_ip_view

run()
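Case 2 installs pymongo before this script, so a natural next step is persisting the per-IP counts to MongoDB. A minimal sketch under that assumption, reusing stat_ip_views() from the script above and the same pymongo 1.x Connection API as the earlier sample (the "logstats" database and "ip_views" collection names are made up):

#!/usr/bin/python
# Sketch: store the per-IP counts in MongoDB via the pymongo 1.x API.
# "logstats" and "ip_views" are hypothetical names for this example;
# stat_ip_views() and zuidaima_nginx_log_path come from the script above.
import pymongo

conn = pymongo.Connection("127.0.0.1", 27017)
db = conn.logstats
for ip, views in stat_ip_views(zuidaima_nginx_log_path).items():
    db.ip_views.save({'ip': ip, 'views': views})   # one document per IP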
Case 3

import os, re, sys, datetime

dlog = r"C:\Users\user\Desktop\data.log"   # raw string so the backslashes stay literal
iplist = []
ip_count = {}   # renamed from "dict" to avoid shadowing the builtin
with open(dlog) as f:
    for i in f.readlines():
        ip = i.split()[0].strip()
        iplist.append(ip)
print iplist
myset = set(iplist)
# print myset
for item in myset:
    # print("the %s has found %d" % (item, iplist.count(item)))
    ip_count[item] = iplist.count(item)
dictr = sorted(ip_count.items(), key=lambda item: item[1], reverse=True)
print dictr
for i in dictr:
    print i[0], i[1]

# tu = dictr[0]
# print tu[1]   # hit count of the most frequently visited IP

Alternatively, the count can be built in a single pass while reading the file:

ip_count[ip] = ip_count.get(ip, 0) + 1
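collections.Counter does the same job in one call, as in the sketch near the top of this article: Counter(iplist) builds the count mapping, and Counter(iplist).most_common(1) returns the busiest IP directly.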

                                            
Tags: nginx log statistics