#!/usr/bin/python
#emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
#ex: set sts=4 ts=4 sw=4 noet:
#------------------------- =+- Python script -+= -------------------------
"""
 @file      postprocdata.py
 @date      Tue May 24 10:28:28 2011
 @brief     Filter, normalize and anonymize collected JSON records.

  Yaroslav Halchenko                                            Dartmouth
  web:     http://www.onerussian.com                              College
  e-mail:  yoh@onerussian.com                              ICQ#: 60653192

 DESCRIPTION (NOTES):

  For every ``*.json`` file found in ``data/`` the script

   - skips files listed in ``blacklist``;
   - skips records in which all OS fields are 'none' and 'bg_datamod'
     is absent;
   - skips records which come from an already seen (IP, user agent) pair
     and differ from a previous record in fewer than 2 fields;
   - stores the accepted record untouched under ``dataorig/``;
   - moves free-text values into the structured fields according to
     ``entries_to_refresh``;
   - shortens the IP address and stores the result under ``dataout/``.

  Both output directories are removed and recreated on every run.

 COPYRIGHT: Yaroslav Halchenko 2011

 LICENSE: MIT

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
  in the Software without restriction, including without limitation the rights
  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  copies of the Software, and to permit persons to whom the Software is
  furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be included in
  all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  THE SOFTWARE.
"""
#-----------------\____________________________________/------------------

__author__ = 'Yaroslav Halchenko'
__revision__ = '$Revision: $'
__date__ = '$Date: $'
__copyright__ = 'Copyright (c) 2011 Yaroslav Halchenko'
# NOTE(review): the header above says "LICENSE: MIT" while this says GPL --
# confirm which one is intended.
__license__ = 'GPL'

import os
import sys
import glob
import json
import re
import shutil
from copy import copy

from mvpa.base import verbose

# Star import supplies `entries_to_refresh` used below.
from common import *

verbose.level = 2

datain = 'data'
dataout = 'dataout'
dataorig = 'dataorig'

blacklist = [
    # persistent and curious mind-ware guy from Israel
    '1305808539.9.json',
    '1305808540.1.json',
    '1305808541.03.json',
]

# Start from scratch: wipe and recreate both output directories.
for d in dataout, dataorig:
    if os.path.exists(d):
        shutil.rmtree(d)
    os.makedirs(d)

# Private marker for "key absent in d2"; cannot collide with any real value.
_MISSING = object()


def ndiffer(d1, d2, skip=('timestamp',)):
    """Return the number of keys of `d1` whose value differs in `d2`.

    Keys listed in `skip` are not compared.  A key of `d1` which is absent
    from `d2` counts as a difference.  Keys present only in `d2` are not
    counted.
    """
    n = 0
    for key in d1:
        if key in skip:
            continue
        if d1[key] != d2.get(key, _MISSING):
            n += 1
    return n


ips = {}         # (ip, user_agent) -> list of accepted records
nwith_ips = 0    # number of loaded records which carried 'remote_addr'
unhandled = {}   # free-text value -> how often nothing matched it
refreshed = {}   # structured name -> how often it was matched
skipped = 0

infiles = glob.glob(os.path.join(datain, '*.json'))

for f in infiles:
    fname = os.path.basename(f)
    if fname in blacklist:
        verbose(1, "Skipping %s because of blacklist" % f)
        skipped += 1
        continue

    verbose(5, "Loading %s" % f)
    with open(f) as fp:
        j = json.load(fp)

    os_fields = [j.get(x) for x in
                 ('man_os', 'pers_os', 'virt_host_os', 'bg_datamod')]
    if os_fields == ['none', 'none', 'none', None]:
        verbose(1, "Skipping %s because all systems are nones" % f)
        skipped += 1
        continue

    if 'remote_addr' in j:
        nwith_ips += 1
        ip = j['remote_addr']
        agent = j.get('user_agent', None)
        previous_entries = ips.get((ip, agent), [])
        # Let's see if we catch results seekers -- check for how many
        # fields are identical
        if previous_entries:
            # NOTE(review): the stored entries are the very dicts which get
            # modified further below (refreshed fields, shortened IP), so
            # this compares post-processed records against the raw one --
            # confirm that is intended.
            min_diff = min(ndiffer(x, j) for x in previous_entries)
            if min_diff < 2:
                verbose(1, "Skipping %s because there is a previous entry which differs only in %d fields" % (f, min_diff,))
                skipped += 1
                continue
        ips[(ip, agent)] = previous_entries + [j]

    # Keep a pristine copy of every accepted record.
    with open(os.path.join(dataorig, fname), 'w') as fp:
        json.dump(j, fp, indent=2)

    # entries_to_refresh: {free-text field: {target field: {name: (regex, isnew)}}}
    for ofield, osubs in entries_to_refresh.items():
        if not (ofield in j and j[ofield]):
            continue
        csv = j[ofield]
        values = [x.strip().lower() for x in re.split('[+,|;]', csv)]
        values = [v for v in values if len(v)]
        verbose(3, "Working on %s: %r" % (ofield, values))

        for sfield, ssubs in osubs.items():
            srecord = copy(j.get(sfield, []))
            old_srecord = j.get(sfield, [])
            for name, (regex, _isnew) in ssubs.items():
                for i, v in enumerate(values):
                    if v is not None and re.match(regex, v):
                        # Found a match -- need to adjust the record
                        # and replace with None in values
                        values[i] = None
                        if name in old_srecord:
                            verbose(1, "Value %s is already in %s=%s"
                                    % (v, sfield, old_srecord))
                        else:
                            verbose(4, "Adding value %s for %s to %s"
                                    % (v, name, sfield))
                            srecord.append(name)
                        # Matches of the 'ignore' pseudo-field are consumed
                        # but not counted.
                        if sfield != 'ignore':
                            refreshed[name] = refreshed.get(name, 0) + 1
            # Drop everything consumed by this target field.
            values = [v for v in values if v is not None]
            if sfield == 'ignore':
                verbose(4, "Skipping ignore")
                continue
            if srecord != old_srecord:
                verbose(4, "Adjusting %s to %s" % (old_srecord, srecord))
                j[sfield] = srecord

        if values:
            verbose(4, "Left unhandled: %s" % (values,))
            for v in values:
                unhandled[v] = unhandled.get(v, 0) + 1

    verbose(3, "Storing file %s" % fname)

    # shorten IP -- only if the record has one, otherwise there is nothing
    # to anonymize (and indexing would raise KeyError)
    # NOTE(review): an address without dots (e.g. IPv6) is kept in full and
    # merely gets '.x.x' appended -- confirm only IPv4 occurs in the data.
    if 'remote_addr' in j:
        j['remote_addr'] = '.'.join(j['remote_addr'].split('.')[:2]) + '.x.x'

    with open(os.path.join(dataout, fname), 'w') as fp:
        json.dump(j, fp, indent=2)

# (ip, agent) pairs with more than one accepted record; not used below, kept
# as a module-level result.
bad_ips = [x for x in ips.items() if len(x[1]) > 1]


def ppd(d):
    """Format a {name: count} dict as 'name: count' lines sorted by name."""
    keys = sorted(d.keys())
    return '\n '.join(["%s: %d" % (k, d[k]) for k in keys])


verbose(1, "=== Refreshed ===\n %s" % ppd(refreshed))
verbose(1, "=== Unhandled ===\n %s" % ppd(unhandled))
verbose(1, "=== Skipped: %d, %d out of %d unique IP/Agent" % (skipped, len(ips), nwith_ips))