一个 Apache 日志分析工具

帮朋友分析 VPS 性能时写的小工具。虽然没有“产品化”,不过我觉得已经基本可用,可以发布出来。

有需要的话,可以稍加修改,以获取想要的信息 :-)


#!/usr/bin/env python

import re
import sys


# Maps each client IP string to the list of access_record objects seen for it.
g_ip_record = dict()
# Maps an 'HH:MM:' bucket (as returned by get_hour_min) to its request count.
g_time_record = dict()

def analyze_by_ip():
	"""Print how many distinct client IPs are stored in ``g_ip_record``."""
	# Parenthesised single-argument print is valid on both Python 2 and 3;
	# the bare print statement is a SyntaxError on Python 3.
	print('number of IPs : %d' % len(g_ip_record))
	


def analyze_by_time():
	"""Print the request count of every recorded time bucket, in sorted key order.

	Each output line has the form ``<hour_min> : <count>``; keys and counts
	are read from the module-level ``g_time_record`` dict.
	"""
	# sorted() works on any Python version; dict.keys() returns a view on
	# Python 3, which has no .sort() method.
	for hour_min in sorted(g_time_record):
		print('%s : %d' % (hour_min, g_time_record[hour_min]))

def analyze():
	"""Emit the full report: per-time-bucket counts first, then the IP total."""
	for report in (analyze_by_time, analyze_by_ip):
		report()
	

def get_hour_min(time, date='24/Jun/2011'):
	"""Extract the ``HH:MM:`` part of an access-log timestamp.

	Args:
		time: Timestamp text of the form ``DD/Mon/YYYY:HH:MM:SS ...``,
			e.g. ``24/Jun/2011:10:15:32 +0800``.
		date: Literal date the timestamp must start with. Defaults to
			``'24/Jun/2011'``, the value that used to be hard-coded here.
			Pass ``None`` to accept a timestamp with any date.

	Returns:
		The hour and minute including the trailing colon (e.g. ``'10:15:'``),
		or ``''`` when ``time`` does not match.
	"""
	if date is None:
		# Anything up to the first colon counts as the date part.
		date_part = r'[^:]+'
	else:
		# Escape so the date is matched literally, not as regex syntax.
		date_part = re.escape(date)
	# Raw string: '\d' in a plain literal is an invalid string escape and
	# triggers a warning on current Python versions.
	match = re.match(date_part + r':(\d+:\d+:)\d+.+', time)
	if match:
		return match.group(1)
	return ''


class access_record:
	"""One logged request: its timestamp text and its browser text."""

	def __init__(self, time, browser):
		# Both values are stored exactly as handed in by the caller.
		self.time, self.browser = time, browser

def process_access_record(ip, time, browser):
	"""Record one parsed request in the per-IP and per-time tallies.

	An ``access_record`` is appended to the list kept for ``ip`` in
	``g_ip_record``. Then, if ``get_hour_min`` yields a non-empty bucket
	for ``time``, that bucket's counter in ``g_time_record`` is incremented.
	"""
	entry = access_record(time, browser)
	g_ip_record.setdefault(ip, []).append(entry)

	bucket = get_hour_min(time)
	if not bucket:
		# Timestamp did not match; only the per-IP list is updated.
		return
	g_time_record[bucket] = g_time_record.get(bucket, 0) + 1


def set_parse_pattern():
	"""Build the compiled regex that picks the fields out of one log line.

	Returns:
		A compiled pattern with three groups: (1) four dot-separated digit
		runs at the start of the line, (2) the text between ``[`` and ``]``,
		and (3) the contents of the double-quoted field that ends the line.
	"""
	# Raw string: the pattern text is unchanged, but '\d', '\.', '\[' and
	# '\]' are no longer invalid string escapes.
	return re.compile(r'(^\d+\.\d+\.\d+\.\d+).+\[(.+)\].+"(.+)"$')


def parse_log_line(line_data, pattern):
	"""Apply ``pattern`` to one log line and return ``(ip, time, browser)``.

	The three values are groups 1, 2 and 3 of the match at the start of
	``line_data``. When the line does not match, all three are ``None``.
	"""
	found = pattern.match(line_data)
	if not found:
		return (None, None, None)
	return found.group(1, 2, 3)

def main():
	"""Parse the log file named on the command line and print the report.

	Expects exactly one argument, the log file path; otherwise prints a
	usage line and exits with status -1. Lines for which any of the three
	parsed fields is missing or empty are skipped.
	"""
	if len(sys.argv) != 2:
		print('Usage: parse_apache_log.py <log_file>')
		# sys.exit() rather than exit(): the latter is injected by the site
		# module and is not guaranteed to exist.
		sys.exit(-1)

	log_pattern = set_parse_pattern()
	# 'with' closes the file even if processing a line raises.
	with open(sys.argv[1], "r") as log_f:
		for line_data in log_f:
			ip, time, browser = parse_log_line(line_data, log_pattern)
			if ip and time and browser:
				process_access_record(ip, time, browser)

	analyze()

if __name__ == "__main__":
	# Run the analysis only when executed as a script, not on import.
	main()



发表评论

电子邮件地址不会被公开。 必填项已用*标注