#!/usr/bin/python
#emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
#ex: set sts=4 ts=4 sw=4 et:
#------------------------- =+- Python script -+= -------------------------
5 """
6  @file      postprocdata.py
7  @date      Tue May 24 10:28:28 2011
8  @brief
9
10
11   Yaroslav Halchenko                                            Dartmouth
12   web:     http://www.onerussian.com                              College
13   e-mail:  yoh@onerussian.com                              ICQ#: 60653192
14
15  DESCRIPTION (NOTES):
16
17  COPYRIGHT: Yaroslav Halchenko 2011
18
19  LICENSE: MIT
20
21   Permission is hereby granted, free of charge, to any person obtaining a copy
22   of this software and associated documentation files (the "Software"), to deal
23   in the Software without restriction, including without limitation the rights
24   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
25   copies of the Software, and to permit persons to whom the Software is
26   furnished to do so, subject to the following conditions:
27
28   The above copyright notice and this permission notice shall be included in
29   all copies or substantial portions of the Software.
30
31   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
32   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
33   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
34   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
35   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
36   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
37   THE SOFTWARE.
38 """
#-----------------\____________________________________/------------------

__author__ = 'Yaroslav Halchenko'
__revision__ = '$Revision: $'
__date__ = '$Date:  $'
__copyright__ = 'Copyright (c) 2011 Yaroslav Halchenko'
__license__ = 'MIT'


import os, sys, glob, json, re, shutil
from copy import copy
from mvpa.base import verbose

verbose.level = 2
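
# Directory layout: 'data' holds the raw survey JSON files; accepted
# originals are copied verbatim into 'dataorig' and post-processed
# versions are written to 'dataout'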
datain = 'data'
dataout = 'dataout'
dataorig = 'dataorig'

blacklist = ['1305808539.9.json', '1305808540.1.json', '1305808541.03.json', # persistent and curious mind-ware guy from Israel
             ]

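# Substitution tables: for each free-text source field (here 'sw_other_name')
# map target fields to {canonical_name: regex} entries.  Each
# comma/semicolon-separated token of an answer is lowercased and matched
# against the regexes; matches are moved into the target field under the
# canonical name, while the 'ignore' bucket swallows known noise.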
all_subs = dict(
    sw_other_name=dict(
        sw_electro=dict(
            cedspike='ced *spike2*',                        # NEW: http://www.ced.co.uk/pru.shtml?spk4wglu.htm
            datariver='exp control: datariver',             # NEW: http://sccn.ucsd.edu/wiki/DataSuite
            eeglab='(eeglab|http://sccn.ucsd.edu/eeglab/)',
            emse='emse',                                    # REFRESH
            erplab='erplab',                                # NEW:     ERPLAB
            klusters='klusters.*',                          # REFRESH
            netstation='egi net station',                   # NEW:     EGI Net Station
            neuroscan='(curry|neuroscan(| curry))',         # REFRESH
            neuroscope='.*neuroscope',                      # REFRESH
            nutmeg='.*nutmeg',                              # NEW
            ),
        sw_img=dict(
            mricron='mricrogl',
            afni='afni for bci',
            dtistudio='dti-*studio',    # NEW: or MRIStudio?
            brainsight='brainsight',    # NEW: BrainSight
            nordicice='nordic ice',     # NEW: NordicICE  -- just 1
            trackvis='trackvis',
            xmedcon='xmedcon',          # NEW
            ),
        sw_general=dict(
            lua='lua',                  # NEW
            stata='stata',              # NEW
            statistica='statistica',    # NEW
            java='java',                # REFRESH
            ),
        sw_neusys=dict(
            neuroml='neuroml',          # NEW: NeuroML -- more of a framework/standard than software
            xpp='xpp(|y|aut)',          # REFRESH: XPP/XPPAUT and Python interface
            ),
        sw_psychphys=dict(
            asf='asf',                  # NEW: ASF  http://code.google.com/p/asf/
            cogent='cogent(|2000)',     # REFRESH
            crsvsg='crs toolbox.*',     # NEW: CRS VSG Toolbox  http://www.crsltd.com/catalog/vsgtoolbox/
            mindware='mind-ware',       # NEW: MindWare
            nordicaktiva='nordic aktiva', # NEW: NordicAktiva -- just 1 http://www.nordicneurolab.com/Products_and_Solutions/Clinical_Software_Solutions/nordicActiva.aspx  http://www.nordicneurolab.com/Products_and_Solutions/Clinical_Software_Solutions/nordicAktiva.aspx
            superlab='superlab',        # REFRESH
            psignifit='psignifit(|3)',  # NEW
            ),
        ignore=dict(ignore=
                    '(zsh vim mutt git'
                    # just ignore
                    '|my overall time.*|separate work.*|60% windows'
                    '|.*my own .*software'
                    # Different generic visualization solutions
                    '|gnupot|.*gnu plot.*xmgrace|mayavi|matplotlib'
                    '|trackvis'
                    '|opengl|itk|vtk'
                    '|paraview'
                    # Really cool one for graphs
                    '|gephi'
                    # Generic DBs
                    '|mysql|postgresql'
                    # DB with imaging data (Italy?) but just once
                    '|loris multi-site database system'
                    # More languages/platforms?
                    '|.net|haskel|gsl|cuda'
                    # Python lovers
                    '|theano|pygame|numpy|mdp|joblib|scipy|pytables|sympy'
                    # ML toolboxes
                    '|scikits-learn|probid .*'
                    # Reference managers
                    '|mendeley|jabref'
                    # Python IDE?? quite nice btw
                    '|spyder'
                    # Move into survey?
                    '|.*magnetic source locator.*' # Some kind of MEG inverse solver -- publications but no public project
                    ')'
                    ),
        ),
    )
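
# A minimal illustration (hypothetical token, not from the survey data) of
# how the tables are applied by the main loop below:
#   re.match('cogent(|2000)', 'cogent2000') matches, so 'cogent' would be
#   appended to that respondent's 'sw_psychphys' record and the token
#   dropped from the unhandled values.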
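# Recreate the output directories from scratch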
for d in dataout, dataorig:
    if os.path.exists(d):
        shutil.rmtree(d)
    os.makedirs(d)

def ndiffer(d1, d2, skip=['timestamp']):
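    """Count the fields of d1 that differ from (or are missing in) d2.

    Keys listed in skip (by default just the submission timestamp) are not
    compared, and keys present only in d2 are not counted, e.g. (made-up
    records):

    >>> ndiffer({'a': 1, 'b': 2, 'timestamp': 5}, {'a': 1, 'timestamp': 9})
    1
    """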
    n = 0
    for key in d1.keys():
        if key in skip:
            continue
        if d1[key] != d2.get(key, 'XXX'):
            n += 1
    return n
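# Tallies across all submissions: 'refreshed' counts recognized names,
# 'unhandled' counts tokens no regex matched, and 'ips' groups accepted
# entries by (IP, user-agent) pair for duplicate detection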
ips = {}
nwith_ips = 0
unhandled = {}
refreshed = {}
infiles = glob.glob(os.path.join(datain, '*.json'))
skipped = 0
#infiles = glob.glob(os.path.join(datain, '1305741725.57.json'))
for f in infiles:
    fname = os.path.basename(f)
    if fname in blacklist:
        verbose(1, "Skipping %s because of blacklist" % f)
        skipped += 1
        continue
    verbose(5, "Loading %s" % f)
    j = json.load(open(f))
    if [j.get(x) for x in ('man_os', 'pers_os', 'virt_host_os', 'bg_datamod')] == \
       ['none', 'none', 'none', None]:
        verbose(1, "Skipping %s because all systems are 'none'" % f)
        skipped += 1
        continue

    if 'remote_addr' in j:
        nwith_ips += 1
        ip = j['remote_addr']
        agent = j.get('user_agent', None)
        previous_entries = ips.get((ip, agent), [])
        # Try to catch result seekers: count how many fields differ from
        # each earlier submission with the same IP/user-agent, and treat a
        # near-identical one as a duplicate resubmission
        if len(previous_entries):
            diffs = [ndiffer(x, j) for x in previous_entries]
            if min(diffs) < 2:
                verbose(1, "Skipping %s because a previous entry differs in only %d field(s)" % (f, min(diffs)))
                skipped += 1
                continue
        ips[(ip, agent)] = previous_entries + [j]

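    # Keep a verbatim copy of the accepted submission before any edits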
    json.dump(j, open(os.path.join(dataorig, fname), 'w'), indent=2)
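    # Fold recognized names from the free-form answers into the
    # corresponding structured fields, tallying anything left over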
    for ofield, osubs in all_subs.iteritems():
        if not (ofield in j and j[ofield]):
            continue
        csv = j[ofield]
        values = [x.strip().lower() for x in re.split('[+,|;]', csv)]
        values = [v for v in values if len(v)]
        original_values = values[:]
        verbose(3, "Working on %s: %r" % (ofield, values))
        for sfield, ssubs in osubs.iteritems():
            srecord = copy(j.get(sfield, []))
            old_srecord = j.get(sfield, [])
            for name, regex in ssubs.iteritems():
                for i, v in enumerate(values):
                    if v is not None and re.match(regex, v):
                        # Found a match -- need to adjust the record
                        # and replace with None in values
                        values[i] = None
                        if name in old_srecord:
                            verbose(1, "Value %s is already in %s=%s" % (v, sfield, old_srecord))
                        else:
                            verbose(4, "Adding value %s for %s to %s" % (v, name, sfield))
                            srecord.append(name)
                        if sfield == 'ignore':
                            # unhandled[v] = unhandled.get(v, 0) + 1
                            pass
                        else:
                            refreshed[name] = refreshed.get(name, 0) + 1
            values = [v for v in values if v is not None]
            if sfield == 'ignore':
                verbose(4, "Skipping ignore")
                continue
            if srecord != old_srecord:
                verbose(4, "Adjusting %s to %s" % (old_srecord, srecord))
                j[sfield] = srecord
        if len(values):
            verbose(4, "Left unhandled: %s" % (values,))
            for v in values:
                unhandled[v] = unhandled.get(v, 0) + 1
    verbose(3, "Storing file %s" % fname)
    json.dump(j, open(os.path.join(dataout, fname), 'w'), indent=2)
    #open(os.path.join(dataout, fname), 'w').write(json.write(j))

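# IP/user-agent pairs that contributed more than one accepted entry (kept
# for manual inspection; not reported below)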
bad_ips = [x for x in ips.items() if len(x[1]) > 1]

def ppd(d):
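    """Pretty-print a {name: count} dict, one 'name: count' per line."""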
    keys = sorted(d.keys())
    return '\n '.join(["%s: %d" % (k, d[k]) for k in keys])

verbose(1, "=== Refreshed ===\n %s" % ppd(refreshed))
verbose(1, "=== Unhandled ===\n %s" % ppd(unhandled))
verbose(1, "=== Skipped: %d; unique IP/Agent pairs: %d out of %d entries with IPs" % (skipped, len(ips), nwith_ips))