#!/usr/bin/python
#emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
#ex: set sts=4 ts=4 sw=4 noet:
#------------------------- =+- Python script -+= -------------------------
"""
 @file      postprocdata.py
 @date      Tue May 24 10:28:28 2011
 @brief     Post-process the survey data and fold free-text "Other"
            answers into the corresponding categorical fields.

  Yaroslav Halchenko                                            Dartmouth
  web:     http://www.onerussian.com                              College
  e-mail:  yoh@onerussian.com                              ICQ#: 60653192

 DESCRIPTION (NOTES):

 COPYRIGHT: Yaroslav Halchenko 2011

 LICENSE: MIT

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
  in the Software without restriction, including without limitation the rights
  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  copies of the Software, and to permit persons to whom the Software is
  furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be included in
  all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  THE SOFTWARE.
"""
#-----------------\____________________________________/------------------

__author__ = 'Yaroslav Halchenko'
__revision__ = '$Revision: $'
__date__ = '$Date:  $'
__copyright__ = 'Copyright (c) 2011 Yaroslav Halchenko'
__license__ = 'MIT'

import os, sys, glob, json, re, shutil
from copy import copy
from mvpa.base import verbose
verbose.level = 4
datain = 'data'
dataout = 'dataout'
dataorig = 'dataorig'

blacklist = ['1305808539.9.json', '1305808540.1.json', '1305808541.03.json', # persistent and curious mind-ware guy from Israel
             ]

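# Mapping describing how to fold free-text "Other" answers back into the
# proper categorical fields: for every free-text field (e.g. sw_other_name)
# list the target fields (sw_electro, sw_img, ...), each holding
# canonical_name -> regex pairs.  Values matching a regex get appended under
# canonical_name to the target field; the special 'ignore' pseudo-field
# simply drops matches instead of refiling them.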
all_subs = dict(
    sw_other_name=dict(
        sw_electro=dict(
            cedspike='ced *spike2*',                        # NEW: http://www.ced.co.uk/pru.shtml?spk4wglu.htm
            datariver='exp control: datariver',             # NEW: http://sccn.ucsd.edu/wiki/DataSuite
            eeglab='(eeglab|http://sccn.ucsd.edu/eeglab/)',
            emse='emse',                                    # REFRESH
            erplab='erplab',                                # NEW:     ERPLAB
            klusters='klusters.*',                          # REFRESH
            netstation='egi net station',                   # NEW:     EGI Net Station
            neuroscan='(curry|neuroscan(| curry))',         # REFRESH
            neuroscope='.*neuroscope',                      # REFRESH
            nutmeg='.*nutmeg',                              # NEW
            ),
        sw_img=dict(
            mricron='mricrogl',
            afni='afni for bci',
            dtistudio='dti-*studio',    # NEW: or MRIStudio?
            brainsight='brainsight',    # NEW: BrainSight
            nordicice='nordic ice',     # NEW: NordicICE  -- just 1
            trackvis='trackvis',
            xmedcon='xmedcon',          # NEW
            ),
        sw_general=dict(
            lua='lua',                  # NEW
            stata='stata',              # NEW
            statistica='statistica',    # NEW
            java='java',                # REFRESH
            ),
        sw_neusys=dict(
            neuroml='neuroml',          # NEW: NeuroML -- more of a framework/standard than software
            xpp='xpp(|y|aut)',          # REFRESH: XPP/XPPAUT and Python interface
            ),
        sw_psychphys=dict(
            asf='asf',                  # NEW: ASF  http://code.google.com/p/asf/
            cogent='cogent(|2000)',     # REFRESH
            crsvsg='crs toolbox.*',     # NEW: CRS VSG Toolbox  http://www.crsltd.com/catalog/vsgtoolbox/
            mindware='mind-ware',       # NEW: MindWare
            nordicaktiva='nordic aktiva', # NEW:    NordicActiva  -- just 1 http://www.nordicneurolab.com/Products_and_Solutions/Clinical_Software_Solutions/nordicActiva.aspx  http://www.nordicneurolab.com/Products_and_Solutions/Clinical_Software_Solutions/nordicAktiva.aspx
            superlab='superlab',        # REFRESH
            psignifit='psignifit(|3)',  # NEW
            ),
        ignore=dict(ignore=
                    '(zsh vim mutt git'
                    # just ignore
                    '|my overall time.*|separate work.*|60% windows'
                    '|.*my own .*software'
                    # Different generic visualization solutions
                    '|gnupot|.*gnu plot.*xmgrace|mayavi|matplotlib'
                    '|trackvis'
                    '|opengl|itk|vtk'
                    '|paraview'
                    # Really cool one for graphs
                    '|gephi'
                    # Generic DBs
                    '|mysql|postgresql'
                    # DB with imaging data (Italy?) but just once
                    '|loris multi-site database system'
                    # More languages/platforms?
                    '|.net|haskel|gsl|cuda'
                    # Python lovers
                    '|theano|pygame|numpy|mdp|joblib|scipy|pytables|sympy'
                    # ML toolboxes
                    '|scikits-learn|probid .*'
                    # Reference managers
                    '|mendeley|jabref'
                    # Python IDE?? quite nice btw
                    '|spyder'
                    # Move into survey?
                    '|.*magnetic source locator.*' # Some kind of MEG inverse solver -- publications but no public project
                    ')'
                    ),
        ),
    )

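# Start with clean output directories: 'dataorig' gets a verbatim copy of
# every accepted submission, 'dataout' the post-processed one.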
for d in dataout, dataorig:
    if os.path.exists(d):
        shutil.rmtree(d)
    os.makedirs(d)

unhandled = {}                # free-text values which did not match any regex
refreshed = {}                # counts of values folded into the proper fields
infiles = glob.glob(os.path.join(datain, '*.json'))
skipped = 0
#infiles = glob.glob(os.path.join(datain, '1305741725.57.json'))
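# Process every submission: skip blacklisted files, keep a verbatim copy in
# dataorig, split each handled free-text field on '+', ',', '|' or ';', fold
# recognized entries into the corresponding categorical field, and store the
# adjusted record in dataout while counting refreshed and unhandled values.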
for f in infiles:
    fname = os.path.basename(f)
    if fname in blacklist:
        verbose(1, "Skipping %s because of blacklist" % f)
        skipped += 1
        continue

    verbose(5, "Loading %s" % f)
    j = json.load(open(f))
    json.dump(j, open(os.path.join(dataorig, fname), 'w'), indent=2)
    for ofield, osubs in all_subs.iteritems():
        if not (ofield in j and j[ofield]):
            continue
        csv = j[ofield]
        values = [x.strip().lower() for x in re.split('[+,|;]', csv)]
        values = [v for v in values if len(v)]
        original_values = values[:]
        verbose(3, "Working on %s: %r" % (ofield, values))
        for sfield, ssubs in osubs.iteritems():
            srecord = copy(j.get(sfield, []))
            old_srecord = j.get(sfield, [])
            for name, regex in ssubs.iteritems():
                for i, v in enumerate(values):
                    if v is not None and re.match(regex, v):
                        # Found a match -- need to adjust the record
                        # and replace with None in values
                        values[i] = None
                        if name in old_srecord:
                            verbose(1, "Value %s is already in %s=%s" % (v, sfield, old_srecord))
                        else:
                            verbose(4, "Adding value %s for %s to %s" % (v, name, sfield))
                            srecord.append(name)
                        if sfield == 'ignore':
                            # unhandled[v] = unhandled.get(v, 0) + 1
                            pass
                        else:
                            refreshed[name] = refreshed.get(name, 0) + 1
            values = [v for v in values if v is not None]
            if sfield == 'ignore':
                verbose(4, "Skipping ignore")
                continue
            if srecord != old_srecord:
                verbose(4, "Adjusting %s to %s" % (old_srecord, srecord))
                j[sfield] = srecord
        if len(values):
            verbose(4, "Left unhandled: %s" % (values,))
            for v in values:
                unhandled[v] = unhandled.get(v, 0) + 1
    verbose(3, "Storing file %s" % fname)
    json.dump(j, open(os.path.join(dataout, fname), 'w'), indent=2)
    #open(os.path.join(dataout, fname), 'w').write(json.write(j))

def ppd(d):
    """Pretty-print a dict of counts: one 'key: count' line per sorted key."""
    keys = sorted(d.keys())
    return '\n '.join(["%s: %d" % (k, d[k]) for k in keys])

verbose(1, "=== Refreshed ===\n %s" % ppd(refreshed))
verbose(1, "=== Unhandled ===\n %s" % ppd(unhandled))
verbose(1, "=== Skipped: %d" % skipped)