# Aggregate nginx geo-tagged access logs into a daily per-visitor CSV report.
import csv
import glob
import gzip
import os
import re
from datetime import datetime, timedelta

from user_agents import parse
# --- Configuration -----------------------------------------------------------
LOG_DIR = '/var/log/nginx'                         # where nginx writes its logs
LOG_PAT = 'access_geo.log*'                        # also matches rotated/.gz files
OUT_DIR = os.path.expanduser('~/nginx_geo_daily')  # per-day reports land here

# The report file is named after yesterday's date (YYYY-MM-DD).
# NOTE(review): the date is used only for the file NAME — records are never
# filtered to that day, so the CSV can contain entries from other days. Confirm
# this is intended.
day_str = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
out_csv = os.path.join(OUT_DIR, f'{day_str}.csv')
# One combined-log line with GeoIP fields appended by the nginx log_format.
# Named groups: ip, dt (timestamp inside [...]), ua (quoted user agent),
# country, region, city (space-free tokens after Country=/Region=/City=).
_ADDR_AND_TIME = r'(?P<ip>\d+\.\d+\.\d+\.\d+) .*?\[(?P<dt>[^\]]+)\] '
_REQUEST_AND_UA = r'".*?" \d+ \d+ ".*?" "(?P<ua>[^"]+)" '
_GEO_FIELDS = r'Country=(?P<country>[^ ]+) Region=(?P<region>[^ ]+) City=(?P<city>[^ ]+)'
line_re = re.compile(_ADDR_AND_TIME + _REQUEST_AND_UA + _GEO_FIELDS)
# Collect one record per parseable log line across every matching log file,
# reading plain and gzip-compressed rotations alike.
rows = []
for log_path in glob.glob(os.path.join(LOG_DIR, LOG_PAT)):
    # Rotated logs are gzipped; pick whichever opener can read this file.
    open_fn = gzip.open if log_path.endswith('.gz') else open
    with open_fn(log_path, 'rt', encoding='utf-8', errors='ignore') as handle:
        for raw_line in handle:
            match = line_re.search(raw_line)
            if match is None:
                continue  # line lacks the expected geo-tagged format
            agent = parse(match.group('ua'))  # user_agents UA classifier
            record = {
                'datetime': match.group('dt'),
                'ip': match.group('ip'),
                'device': agent.device.family,
                'os': agent.os.family,
                'browser': agent.browser.family,
                'country': match.group('country'),
                'region': match.group('region'),
                'city': match.group('city'),
            }
            rows.append(record)
# De-duplicate: keep the LAST record seen for each (ip, datetime) pair
# (later dict insertions overwrite earlier ones), then order chronologically.
#
# Fix: the previous sort key was the raw $time_local string
# ('10/Oct/2024:13:55:36 +0000'), which does NOT sort chronologically as text
# because the day comes first and the month is a name. Parse it instead.
# NOTE(review): assumes every 'datetime' value matches the standard nginx
# time format '%d/%b/%Y:%H:%M:%S %z' — confirm against the log_format.
_unique = {(r['ip'], r['datetime']): r for r in rows}
rows = sorted(
    _unique.values(),
    key=lambda r: datetime.strptime(r['datetime'], '%d/%b/%Y:%H:%M:%S %z'),
)
# Emit the daily report; create the output directory on first run.
FIELDNAMES = ['datetime', 'ip', 'device', 'os', 'browser', 'country', 'region', 'city']
os.makedirs(OUT_DIR, exist_ok=True)
with open(out_csv, 'w', newline='', encoding='utf-8') as csv_file:
    report = csv.DictWriter(csv_file, fieldnames=FIELDNAMES)
    report.writeheader()
    report.writerows(rows)
print(f'{len(rows)} records written to {out_csv}')