2 #emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
3 #ex: set sts=4 ts=4 sw=4 noet:
4 #------------------------- =+- Python script -+= -------------------------
7 @date Tue May 24 10:28:28 2011
11 Yaroslav Halchenko Dartmouth
12 web: http://www.onerussian.com College
13 e-mail: yoh@onerussian.com ICQ#: 60653192
17 COPYRIGHT: Yaroslav Halchenko 2011
21 Permission is hereby granted, free of charge, to any person obtaining a copy
22 of this software and associated documentation files (the "Software"), to deal
23 in the Software without restriction, including without limitation the rights
24 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
25 copies of the Software, and to permit persons to whom the Software is
26 furnished to do so, subject to the following conditions:
28 The above copyright notice and this permission notice shall be included in
29 all copies or substantial portions of the Software.
31 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
32 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
33 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
34 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
35 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
36 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
39 #-----------------\____________________________________/------------------
41 __author__ = 'Yaroslav Halchenko'
42 __revision__ = '$Revision: $'
44 __copyright__ = 'Copyright (c) 2011 Yaroslav Halchenko'
48 import os, sys, glob, json, re, shutil
50 from mvpa.base import verbose
# --- Configuration data (fragmentary excerpt; interior lines of the original
# file are missing, so the literals below are not syntactically complete) ---
#
# `blacklist`: survey-response JSON filenames to skip entirely (timestamped
# names; the inline comment explains why these particular entries are junk).
56 blacklist = ['1305808539.9.json', '1305808540.1.json', '1305808541.03.json', # persistent and curious mind-ware guy from Israel
# The keyword=regex pairs below are entries of a (partially shown) nested
# mapping -- later code iterates it as `all_subs.iteritems()` and matches each
# regex against lowercased, comma/semicolon-split survey answers.
# "NEW"/"REFRESH" in the inline comments appear to tag whether the entry was
# newly added or carried over -- TODO confirm against full file history.
62     cedspike='ced *spike2*', # NEW: http://www.ced.co.uk/pru.shtml?spk4wglu.htm
63     datariver='exp control: datariver', # NEW: http://sccn.ucsd.edu/wiki/DataSuite
64     eeglab='(eeglab|http://sccn.ucsd.edu/eeglab/)',
65     emse='emse', # REFRESH
66     erplab='erplab', # NEW: ERPLAB
67     klusters='klusters.*', # REFRESH
68     netstation='egi net station', # NEW: EGI Net Station
69     neuroscan='(curry|neuroscan(| curry))', # REFRESH
70     neuroscope='.*neuroscope', # REFRESH
71     nutmeg='.*nutmeg', # NEW
76     dtistudio='dti-*studio', # NEW: or MRIStudio?
77     brainsight='brainsight', # NEW: BrainSight
78     nordicice='nordic ice', # NEW: NordicICE -- just 1
80     xmedcon='xmedcon', # NEW
85     statistica='statistica', # NEW
86     java='java', # REFRESH
89     neuroml='neuroml', # NEW: NeuroML -- more of a framework/standard than software
90     xpp='xpp(|y|aut)', # REFRESH: XPP/XPPAUT and Python interface
93     asf='asf', # NEW: ASF http://code.google.com/p/asf/
94     cogent='cogent(|2000)', # REFRESH
95     crsvsg='crs toolbox.*', # NEW: CRS VSG Toolbox http://www.crsltd.com/catalog/vsgtoolbox/
96     mindware='mind-ware', # NEW: MindWare
97     nordicaktiva='nordic aktiva', # NEW: NordicActiva -- just 1 http://www.nordicneurolab.com/Products_and_Solutions/Clinical_Software_Solutions/nordicActiva.aspx http://www.nordicneurolab.com/Products_and_Solutions/Clinical_Software_Solutions/nordicAktiva.aspx
98     superlab='superlab', # REFRESH
99     psignifit='psignifit(|3)', # NEW
# The adjacent string literals below are pieces of one big implicitly
# concatenated '|'-alternation regex (presumably the 'ignore' bucket, given
# the `sfield == 'ignore'` handling in the main loop -- TODO confirm).
104     '|my overall time.*|separate work.*|60% windows'
105     '|.*my own .*software'
106     # Different generic visualization solutions
# NOTE(review): 'gnupot' below looks like a typo for 'gnuplot', and there is
# apparently a missing '|' between '.*gnu plot.*' and 'xmgrace' (as written
# they form a single alternative '.*gnu plot.*xmgrace') -- verify and fix in
# the full file.
107     '|gnupot|.*gnu plot.*xmgrace|mayavi|matplotlib'
111     # Really cool one for graphs
115     # DB with imaging data (Italy?) but just once
116     '|loris multi-site database system'
117     # More languages/platforms?
118     '|.net|haskel|gsl|cuda'
120     '|theano|pygame|numpy|mdp|joblib|scipy|pytables|sympy'
122     '|scikits-learn|probid .*'
125     # Python IDE?? quite nice btw
128     '|.*magnetic source locator.*' # Some kind of MEG inverse solver -- publications but no public project
# Reset/prepare the two output directories (body lines 136-138 are missing
# from this excerpt; given the `shutil` import, presumably the existing
# directory tree is removed and/or recreated -- TODO confirm).
134 for d in dataout, dataorig:
135     if os.path.exists(d):
# ndiffer: count in how many fields dict `d1` differs from dict `d2`,
# apparently skipping keys listed in `skip` (default: the volatile
# 'timestamp' field).  Used below to detect near-duplicate submissions from
# the same IP/agent.  Body is incomplete here (original lines 142-143, 145+
# are missing from the excerpt).
# NOTE(review): `skip=['timestamp']` is a mutable default argument -- safe
# only if never mutated; prefer `skip=None` + in-body default.  The 'XXX'
# sentinel makes keys absent from d2 count as differing.
139 def ndiffer(d1, d2, skip=['timestamp']):
141     for key in d1.keys():
144         if d1[key] != d2.get(key, 'XXX'):
# --- Main processing loop (fragmentary excerpt; the `for f in infiles:`
# header itself and several body lines are missing) ---
# Collect all survey-response JSON files from the input directory.
152 infiles = glob.glob(os.path.join(datain, '*.json'))
# Debug helper: process a single known file instead of the full set.
154 #infiles = glob.glob(os.path.join(datain, '1305741725.57.json'))
# Per-file filtering: skip blacklisted files and all-'none' submissions.
156     fname = os.path.basename(f)
157     if fname in blacklist:
158         verbose(1, "Skipping %s because of blacklist" % f)
161     verbose(5, "Loading %s" % f)
# NOTE(review): `json.load(open(f))` leaks the file handle (no close /
# context manager); same for the json.dump(...) calls below.
162     j = json.load(open(f))
# NOTE(review): the un-parenthesized tuple in this comprehension is
# Python 2 only syntax (consistent with `iteritems()` below).
163     if [j.get(x) for x in 'man_os', 'pers_os', 'virt_host_os', 'bg_datamod'] == \
164        ['none', 'none', 'none', None]:
165         verbose(1, "Skipping %s because all systems are nones" % f)
# Duplicate-submitter detection: group entries by (remote IP, user agent)
# and skip a new entry that differs from a previous one in too few fields
# (the threshold test between lines 177 and 179 is missing from the excerpt).
169     if 'remote_addr' in j:
171         ip = j['remote_addr']
172         agent = j.get('user_agent', None)
173         previous_entries = ips.get((ip, agent), [])
174         # Let's see if we catch results seekers -- check for how many
175         # fields are identical
176         if len(previous_entries):
177             diffs = [ndiffer(x, j) for x in previous_entries]
179                 verbose(1, "Skipping %s because there is a previous entry which differs only in %d fields" % (f, min(diffs),))
182         ips[(ip, agent)] = previous_entries + [j]
# Preserve the original (pre-normalization) record.
184     json.dump(j, open(os.path.join(dataorig, fname), 'w'), indent=2)
# Normalization: for each free-text field `ofield`, split its CSV-ish answer
# on [+,|;], lowercase/trim the tokens, then try every (name, regex) rule of
# every target subfield; matched tokens are moved into the structured
# `sfield` record (or dropped when sfield == 'ignore').
185     for ofield, osubs in all_subs.iteritems():
186         if not (ofield in j and j[ofield]):
189         values = [x.strip().lower() for x in re.split('[+,|;]', csv)]
190         values = [v for v in values if len(v)]
191         original_values = values[:]
192         verbose(3, "Working on %s: %r" % (ofield, values))
193         for sfield, ssubs in osubs.iteritems():
# `copy` here is presumably `copy.copy` imported elsewhere in the file --
# the visible imports don't include it; TODO confirm.
194             srecord = copy(j.get(sfield, []))
195             old_srecord = j.get(sfield, [])
196             for name, regex in ssubs.iteritems():
197                 for i, v in enumerate(values):
198                     if v is not None and re.match(regex, v):
199                         # Found a match -- need to adjust the record
200                         # and replace with None in values
202                         if name in old_srecord:
203                             verbose(1, "Value %s is already in %s=%s" % (v, sfield, old_srecord))
205                             verbose(4, "Adding value %s for %s to %s" % (v, name, sfield))
207                         if sfield == 'ignore':
208                             # unhandled[v] = unhandled.get(v, 0) + 1
# Tally how many times each canonical name was (re)encountered.
211                             refreshed[name] = refreshed.get(name, 0) + 1
# Drop tokens consumed by the rules above (matched ones were None-ed out).
212             values = [v for v in values if v is not None]
213             if sfield == 'ignore':
214                 verbose(4, "Skipping ignore")
216             if srecord != old_srecord:
217                 verbose(4, "Adjusting %s to %s" % (old_srecord, srecord))
# Anything still left in `values` matched no rule: record it for the
# "Unhandled" report printed at the end of the script.
220             verbose(4, "Left unhandled: %s" % (values,))
222                 unhandled[v] = unhandled.get(v, 0) + 1
# Write the normalized record out.
223     verbose(3, "Storing file %s" % fname)
224     json.dump(j, open(os.path.join(dataout, fname), 'w'), indent=2)
225     #open(os.path.join(dataout, fname), 'w').write(json.write(j))
# --- Final reporting (fragmentary excerpt) ---
# (ip, agent) keys that submitted more than one surviving entry.
227 bad_ips = [x for x in ips.items() if len(x[1])>1]
# Pretty-print helper body: renders {key: count} as aligned "key: count"
# lines.  Its `def` line (presumably `def ppd(d):`, given the calls below)
# is missing from this excerpt.
230     keys = sorted(d.keys())
231     return '\n    '.join(["%s: %d" % (k, d[k]) for k in keys])
# Summary of the run: rules refreshed, tokens left unhandled, and skip stats.
233 verbose(1, "=== Refreshed ===\n    %s" % ppd(refreshed))
234 verbose(1, "=== Unhandled ===\n    %s" % ppd(unhandled))
235 verbose(1, "=== Skipped: %d, %d out of %d unique IP/Agent" % (skipped, len(ips), nwith_ips))