2 #emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
3 #ex: set sts=4 ts=4 sw=4 noet:
4 #------------------------- =+- Python script -+= -------------------------
7 @date Tue May 24 10:28:28 2011
11 Yaroslav Halchenko Dartmouth
12 web: http://www.onerussian.com College
13 e-mail: yoh@onerussian.com ICQ#: 60653192
17 COPYRIGHT: Yaroslav Halchenko 2011
21 Permission is hereby granted, free of charge, to any person obtaining a copy
22 of this software and associated documentation files (the "Software"), to deal
23 in the Software without restriction, including without limitation the rights
24 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
25 copies of the Software, and to permit persons to whom the Software is
26 furnished to do so, subject to the following conditions:
28 The above copyright notice and this permission notice shall be included in
29 all copies or substantial portions of the Software.
31 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
32 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
33 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
34 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
35 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
36 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
39 #-----------------\____________________________________/------------------
# Module-level metadata: author name, a revision placeholder string, and
# the copyright line.
41 __author__ = 'Yaroslav Halchenko'
42 __revision__ = '$Revision: $'
44 __copyright__ = 'Copyright (c) 2011 Yaroslav Halchenko'
48 import os, sys, glob, json, re, shutil
50 from mvpa.base import verbose
# Base names of submission files that are skipped outright; each input file's
# basename is tested against this list further below.
# NOTE(review): the list literal continues beyond this line -- its remaining
# entries and closing bracket are not visible in this excerpt.
58 blacklist = ['1305808539.9.json', '1305808540.1.json', '1305808541.03.json', # persistent and curious mind-ware guy from Israel
# Visit both output directories (`dataout` and `dataorig`, which the JSON
# dumps below write into).  The loop body is not visible in this excerpt.
62 for d in dataout, dataorig:
# Compare two dict-like records `d1` and `d2`.  Callers below take `min()` of
# the results and format it with %d, so this presumably returns the number of
# differing fields -- TODO confirm, most of the body is not visible here.
# `skip` presumably names keys left out of the comparison (default: only
# 'timestamp') -- verify against the full body.
# NOTE(review): `skip` is a mutable default argument; that is only safe as
# long as the function never mutates it.
67 def ndiffer(d1, d2, skip=['timestamp']):
# 'XXX' is the stand-in for a key that is absent from d2, so an absent key
# compares unequal to any d1 value other than the literal string 'XXX'.
72 if d1[key] != d2.get(key, 'XXX'):
# ---- Main pass over the submitted JSON files --------------------------------
# NOTE(review): this excerpt omits a number of lines (the loop header over
# `infiles`, several branch bodies, `continue` statements, counters), so the
# comments below describe only what the visible statements do.

# Collect every *.json file found directly under `datain`.
80 infiles = glob.glob(os.path.join(datain, '*.json'))
82 #infiles = glob.glob(os.path.join(datain, '1305741725.57.json'))
# The blacklist holds bare file names, hence the basename.
84 fname = os.path.basename(f)
85 if fname in blacklist:
86 verbose(1, "Skipping %s because of blacklist" % f)
89 verbose(5, "Loading %s" % f)
# NOTE(review): the file object created by open() is never closed explicitly.
90 j = json.load(open(f))
# Detect submissions in which the three OS fields all hold the string 'none'
# and 'bg_datamod' is missing or None.
# NOTE(review): the unparenthesized tuple after `in` inside the comprehension
# is Python 2-only syntax.
91 if [j.get(x) for x in 'man_os', 'pers_os', 'virt_host_os', 'bg_datamod'] == \
92 ['none', 'none', 'none', None]:
93 verbose(1, "Skipping %s because all systems are nones" % f)
# Submissions carrying a remote address are tracked per (ip, agent) pair in
# the `ips` dict.  `ip` is assigned on a line not visible in this excerpt.
97 if 'remote_addr' in j:
100 agent = j.get('user_agent', None)
101 previous_entries = ips.get((ip, agent), [])
102 # Let's see if we catch results seekers -- check for how many
103 # fields are identical
104 if len(previous_entries):
# One ndiffer() result per earlier submission from the same (ip, agent).
105 diffs = [ndiffer(x, j) for x in previous_entries]
# The condition guarding this message (a threshold on min(diffs), presumably)
# is not visible here -- TODO confirm.
107 verbose(1, "Skipping %s because there is a previous entry which differs only in %d fields" % (f, min(diffs),))
# Record this submission under its (ip, agent) key; a new list is built rather
# than appending in place.
110 ips[(ip, agent)] = previous_entries + [j]
# Write the record, pretty-printed with a 2-space indent, into `dataorig`
# under the same file name.
# NOTE(review): the file object created by open() is never closed explicitly.
112 json.dump(j, open(os.path.join(dataorig, fname), 'w'), indent=2)
# `entries_to_refresh` maps a free-text field name (`ofield`) to a dict of
# per-target-field substitution tables (`osubs`); it is defined outside this
# excerpt.  NOTE(review): .iteritems() exists only on Python 2 dicts.
113 for ofield, osubs in entries_to_refresh.iteritems():
# True when the field is absent from the record or holds a falsy value; the
# statement it guards is not visible here.
114 if not (ofield in j and j[ofield]):
# Split the text in `csv` (assigned on a line not visible here) on any of
# '+', ',', '|' or ';', then trim and lowercase each piece ...
117 values = [x.strip().lower() for x in re.split('[+,|;]', csv)]
# ... and drop the pieces that ended up empty.
118 values = [v for v in values if len(v)]
# Shallow copy of the cleaned values taken before any of them get consumed.
119 original_values = values[:]
120 verbose(3, "Working on %s: %r" % (ofield, values))
# `sfield` is the target field name; `ssubs` maps a name to a
# (regex, isnew) pair.
121 for sfield, ssubs in osubs.iteritems():
# `srecord` is a copy of the target field's current value (default []), while
# `old_srecord` is the value itself, kept for the comparison further down.
# NOTE(review): `copy` is not imported in the visible lines -- presumably
# imported elsewhere in the file.
122 srecord = copy(j.get(sfield, []))
123 old_srecord = j.get(sfield, [])
124 for name, (regex, isnew) in ssubs.iteritems():
125 for i, v in enumerate(values):
# Entries of `values` that are None are skipped.  re.match anchors only at the
# start of the string, so `regex` has to match a prefix of the value.
126 if v is not None and re.match(regex, v):
127 # Found a match -- need to adjust the record
128 # and replace with None in values
130 if name in old_srecord:
131 verbose(1, "Value %s is already in %s=%s" % (v, sfield, old_srecord))
133 verbose(4, "Adding value %s for %s to %s" % (v, name, sfield))
135 if sfield == 'ignore':
136 # unhandled[v] = unhandled.get(v, 0) + 1
# Tally the substitutions per canonical name.
139 refreshed[name] = refreshed.get(name, 0) + 1
# Drop the None entries so only still-unmatched values remain.
140 values = [v for v in values if v is not None]
141 if sfield == 'ignore':
142 verbose(4, "Skipping ignore")
# Logged when the working copy differs from the field's previous value.
144 if srecord != old_srecord:
145 verbose(4, "Adjusting %s to %s" % (old_srecord, srecord))
148 verbose(4, "Left unhandled: %s" % (values,))
# Tally values per distinct string in `unhandled`; the loop supplying `v` is
# not visible here.
150 unhandled[v] = unhandled.get(v, 0) + 1
151 verbose(3, "Storing file %s" % fname)
# Anonymize the address: keep its first two dot-separated components and
# replace the rest with '.x.x'.
153 j['remote_addr'] = '.'.join(j['remote_addr'].split('.')[:2]) + '.x.x'
# Write the record, pretty-printed with a 2-space indent, into `dataout`.
# NOTE(review): the file object created by open() is never closed explicitly.
154 json.dump(j, open(os.path.join(dataout, fname), 'w'), indent=2)
155 #open(os.path.join(dataout, fname), 'w').write(json.write(j))
# (ip, agent) items of `ips` for which more than one submission was recorded.
157 bad_ips = [x for x in ips.items() if len(x[1])>1]
# The next two lines are the body of a helper whose `def` line is not visible
# in this excerpt (judging by the calls below it is `ppd`, taking a dict `d`):
# it renders the dict as "key: count" entries in sorted key order, joined by a
# newline followed by one space.  The %d format requires integer-like values.
160 keys = sorted(d.keys())
161 return '\n '.join(["%s: %d" % (k, d[k]) for k in keys])
# Final report: per-name substitution counts, counts of values that were not
# handled, and the skip / IP statistics.  `skipped` and `nwith_ips` are
# counters maintained on lines not visible in this excerpt.
163 verbose(1, "=== Refreshed ===\n %s" % ppd(refreshed))
164 verbose(1, "=== Unhandled ===\n %s" % ppd(unhandled))
165 verbose(1, "=== Skipped: %d, %d out of %d unique IP/Agent" % (skipped, len(ips), nwith_ips))