]> git.donarmstrong.com Git - neurodebian.git/commitdiff
rename postprocdata.py in the best traditions of Debian
authorYaroslav Halchenko <debian@onerussian.com>
Wed, 25 May 2011 00:36:44 +0000 (20:36 -0400)
committerYaroslav Halchenko <debian@onerussian.com>
Wed, 25 May 2011 00:36:44 +0000 (20:36 -0400)
survey/postprocdata [new file with mode: 0755]
survey/postprocdata.py [deleted file]

diff --git a/survey/postprocdata b/survey/postprocdata
new file mode 100755 (executable)
index 0000000..4cc3724
--- /dev/null
@@ -0,0 +1,235 @@
+#!/usr/bin/python
+#emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- 
+#ex: set sts=4 ts=4 sw=4 noet:
+#------------------------- =+- Python script -+= -------------------------
+"""
+ @file      postprocdata.py
+ @date      Tue May 24 10:28:28 2011
+ @brief
+
+
+  Yaroslav Halchenko                                            Dartmouth
+  web:     http://www.onerussian.com                              College
+  e-mail:  yoh@onerussian.com                              ICQ#: 60653192
+
+ DESCRIPTION (NOTES):
+
+ COPYRIGHT: Yaroslav Halchenko 2011
+
+ LICENSE: MIT
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+"""
+#-----------------\____________________________________/------------------
+
+__author__ = 'Yaroslav Halchenko'
+__revision__ = '$Revision: $'
+__date__ = '$Date:  $'
+__copyright__ = 'Copyright (c) 2011 Yaroslav Halchenko'
+# NOTE(review): the header docstring above says "LICENSE: MIT" but this
+# dunder says 'GPL' -- one of the two is wrong; confirm the intended license.
+__license__ = 'GPL'
+
+
+import os, sys, glob, json, re, shutil
+from copy import copy
+# `verbose` comes from PyMVPA; `verbose.level` gates which verbose() calls
+# below actually print (higher level == chattier).
+from mvpa.base import verbose
+verbose.level = 2
+# Directory layout: raw survey submissions are read from `datain`;
+# post-processed copies go to `dataout`, untouched accepted originals
+# to `dataorig` (both are wiped and recreated on every run).
+datain = 'data'
+dataout = 'dataout'
+dataorig = 'dataorig'
+
+# Submissions dropped entirely before any processing (file basenames).
+blacklist = ['1305808539.9.json', '1305808540.1.json', '1305808541.03.json', # persistent and curious mind-ware guy from Israel
+             ]
+
+# Rules for folding free-form "other software" survey answers back into the
+# structured list fields.  Shape:
+#   {csv_source_field: {target_list_field: {canonical_name: regex}}}
+# The source field's text is split on [+,|;], lowercased, and each token is
+# tried with re.match() against every regex; a match appends canonical_name
+# to the target field.  The special target 'ignore' swallows tokens that
+# should not be counted at all.  Tokens matched by nothing are tallied in
+# `unhandled` for the summary report.
+all_subs = dict(
+    sw_other_name=dict(
+        sw_electro=dict(
+            cedspike='ced *spike2*',                        # NEW: http://www.ced.co.uk/pru.shtml?spk4wglu.htm
+            datariver='exp control: datariver',             # NEW: http://sccn.ucsd.edu/wiki/DataSuite
+            eeglab='(eeglab|http://sccn.ucsd.edu/eeglab/)',
+            emse='emse',                                    # REFRESH
+            erplab='erplab',                                # NEW:     ERPLAB
+            klusters='klusters.*',                          # REFRESH
+            netstation='egi net station',                   # NEW:     EGI Net Station
+            neuroscan='(curry|neuroscan(| curry))',         # REFRESH
+            neuroscope='.*neuroscope',                      # REFRESH
+            nutmeg='.*nutmeg',                              # NEW
+            ),
+        sw_img=dict(
+            # NOTE(review): key is 'mricron' but the pattern matches
+            # 'mricrogl' (MRIcroGL is a distinct viewer) -- confirm which
+            # package this mapping is meant to credit.
+            mricron='mricrogl',
+            afni='afni for bci',
+            dtistudio='dti-*studio',    # NEW: or MRIStudio?
+            brainsight='brainsight',    # NEW: BrainSight
+            nordicice='nordic ice',     # NEW: NordicICE  -- just 1
+            trackvis='trackvis',
+            xmedcon='xmedcon',          # NEW
+            ),
+        sw_general=dict(
+            lua='lua',                  # NEW
+            stata='stata',              # NEW
+            statistica='statistica',    # NEW
+            java='java',                # REFRESH
+            ),
+        sw_neusys=dict(
+            neuroml='neuroml',          # NEW: NeuroML -- more of a framework/standard than software
+            xpp='xpp(|y|aut)',        # REFRESH: XPP/XPPAUT and Python interface
+            ),
+        sw_psychphys=dict(
+            asf='asf',                  # NEW: ASF  http://code.google.com/p/asf/
+            cogent='cogent(|2000)',     # REFRESH
+            crsvsg='crs toolbox.*',     # NEW: CRS VSG Toolbox  http://www.crsltd.com/catalog/vsgtoolbox/
+            mindware='mind-ware',       # NEW: MindWare
+            nordicaktiva='nordic aktiva', # NEW:    NordicActiva  -- just 1 http://www.nordicneurolab.com/Products_and_Solutions/Clinical_Software_Solutions/nordicActiva.aspx  http://www.nordicneurolab.com/Products_and_Solutions/Clinical_Software_Solutions/nordicAktiva.aspx
+            superlab='superlab',        # REFRESH
+            psignifit='psignifit(|3)',  # NEW
+            ),
+        ignore=dict(ignore=
+                    # One big alternation built from adjacent string literals;
+                    # the commented groups below label each run of branches.
+                    '(zsh vim mutt git'
+                    # just ignore
+                    '|my overall time.*|separate work.*|60% windows'
+                    '|.*my own .*software'
+                    # Different generic visualization solutions
+                    # NOTE(review): 'gnupot' looks like a typo for 'gnuplot',
+                    # and '.*gnu plot.*xmgrace' seems to be missing a '|'
+                    # between the gnuplot and xmgrace alternatives -- verify.
+                    '|gnupot|.*gnu plot.*xmgrace|mayavi|matplotlib'
+                    '|trackvis'
+                    '|opengl|itk|vtk'
+                    '|paraview'
+                    # Really cool one for graphs
+                    '|gephi'
+                    # Generic DBs
+                    '|mysql|postgresql'
+                    # DB with imaging data (Italy?) but just once
+                    '|loris multi-site database system'
+                    # More languages/platforms?
+                    '|.net|haskel|gsl|cuda'
+                    # Python lovers
+                    '|theano|pygame|numpy|mdp|joblib|scipy|pytables|sympy'
+                    # ML toolboxes
+                    '|scikits-learn|probid .*'
+                    # Reference managers
+                    '|mendeley|jabref'
+                    # Python IDE?? quite nice btw
+                    '|spyder'
+                    # Move into survey?
+                    '|.*magnetic source locator.*' # Some kind of MEG inverse solver -- publications but no public project
+                    ')'
+                    ),
+        ),
+    )
+
+# Start every run from a clean slate: wipe and recreate both output
+# directories (the input directory `datain` is left untouched).
+for d in dataout, dataorig:
+    if os.path.exists(d):
+        shutil.rmtree(d)
+    os.makedirs(d)
+
def ndiffer(d1, d2, skip=('timestamp',)):
    """Count the keys of `d1` whose values differ from `d2`.

    Keys listed in `skip` are ignored.  A key present in `d1` but absent
    from `d2` counts as a difference.  Used to flag near-duplicate survey
    submissions from the same IP/user-agent pair.

    Note: comparison is one-sided -- keys present only in `d2` are not
    counted, matching the original behavior.
    """
    # Unique sentinel: the previous placeholder was the string 'XXX', so a
    # genuine value of 'XXX' in d1 with the key missing from d2 was wrongly
    # counted as equal.  An object() can never compare equal to JSON data.
    missing = object()
    n = 0
    for key, value in d1.items():
        if key in skip:
            continue
        if value != d2.get(key, missing):
            n += 1
    return n
+
+# Per-(ip, user_agent) map of already-accepted submissions, used to spot
+# near-duplicate re-submissions from the same client.
+ips = {}
+nwith_ips = 0
+# Histogram of free-form tokens no regex in all_subs claimed.
+unhandled = {}
+# Histogram of canonical names folded into the structured fields.
+refreshed = {}
+infiles = glob.glob(os.path.join(datain, '*.json'))
+skipped = 0
+#infiles = glob.glob(os.path.join(datain, '1305741725.57.json'))
+for f in infiles:
+    fname = os.path.basename(f)
+    if fname in blacklist:
+        verbose(1, "Skipping %s because of blacklist" % f)
+        skipped += 1
+        continue
+    verbose(5, "Loading %s" % f)
+    # NOTE(review): file handles here and in the json.dump() calls below are
+    # never closed explicitly -- relies on CPython refcounting.
+    j = json.load(open(f))
+    # Drop content-free entries: every OS answer is 'none' and no data
+    # modality was given.
+    if [j.get(x) for x in 'man_os', 'pers_os', 'virt_host_os', 'bg_datamod'] == \
+       ['none', 'none', 'none', None]:
+        verbose(1, "Skipping %s because all systems are nones" % f)
+        skipped += 1
+        continue
+
+    if 'remote_addr' in j:
+        nwith_ips += 1
+        ip = j['remote_addr']
+        agent = j.get('user_agent', None)
+        previous_entries = ips.get((ip, agent), [])
+        # Let's see if we catch results seekers -- check for how many
+        # fields are identical
+        if len(previous_entries):
+            diffs = [ndiffer(x, j) for x in previous_entries]
+            # Fewer than 2 fields differing (timestamp excluded) from any
+            # earlier entry of the same IP/agent => treat as a duplicate.
+            if min(diffs) < 2:
+                verbose(1, "Skipping %s because there is a previous entry which differs only in %d fields" % (f, min(diffs),))
+                skipped += 1
+                continue
+        ips[(ip, agent)] = previous_entries + [j]
+
+    # Keep a pristine copy of every accepted submission before rewriting it.
+    json.dump(j, open(os.path.join(dataorig, fname), 'w'), indent=2)
+    # NOTE: dict.iteritems() makes this script Python 2 only.
+    for ofield, osubs in all_subs.iteritems():
+        if not (ofield in j and j[ofield]):
+            continue
+        csv = j[ofield]
+        # Split the free-form answer on any of + , | ; and normalize case;
+        # consumed tokens are nulled out (set to None) as regexes match them.
+        values = [x.strip().lower() for x in re.split('[+,|;]', csv)]
+        values = [v for v in values if len(v)]
+        original_values = values[:]
+        verbose(3, "Working on %s: %r" % (ofield, values))
+        for sfield, ssubs in osubs.iteritems():
+            # srecord is the working copy; old_srecord stays as the original
+            # list so duplicates and changes can be detected afterwards.
+            srecord = copy(j.get(sfield, []))
+            old_srecord = j.get(sfield, [])
+            for name, regex in ssubs.iteritems():
+                for i, v in enumerate(values):
+                    if v is not None and re.match(regex, v):
+                        # Found a match -- need to adjust the record
+                        # and replace with None in values
+                        values[i] = None
+                        if name in old_srecord:
+                            verbose(1, "Value %s is already in %s=%s" % (v, sfield, old_srecord))
+                        else:
+                            verbose(4, "Adding value %s for %s to %s" % (v, name, sfield))
+                            srecord.append(name)
+                        if sfield == 'ignore':
+                            # unhandled[v] = unhandled.get(v, 0) + 1
+                            pass
+                        else:
+                            refreshed[name] = refreshed.get(name, 0) + 1
+            # Compact out the tokens consumed by this target field.
+            values = [v for v in values if v is not None]
+            if sfield == 'ignore':
+                verbose(4, "Skipping ignore")
+                continue
+            if srecord != old_srecord:
+                verbose(4, "Adjusting %s to %s" % (old_srecord, srecord))
+                j[sfield] = srecord
+        if len(values):
+            verbose(4, "Left unhandled: %s" % (values,))
+            for v in values:
+                unhandled[v] = unhandled.get(v, 0) + 1
+    verbose(3, "Storing file %s" % fname)
+    json.dump(j, open(os.path.join(dataout, fname), 'w'), indent=2)
+    #open(os.path.join(dataout, fname), 'w').write(json.write(j))
+
+# (ip, agent) pairs that submitted more than once; computed but only kept
+# for interactive inspection -- not reported below.
+bad_ips = [x for x in ips.items() if len(x[1])>1]
+
def ppd(d):
    """Pretty-print dict `d` as 'key: count' lines, ordered by key.

    Entries after the first are indented by one space so the output lines
    up under the verbose() report headers.
    """
    return '\n '.join('%s: %d' % item for item in sorted(d.items()))
+
+# Final summary: which canonical names were folded in, which free-form
+# tokens remained unclassified, and how many submissions were skipped.
+verbose(1, "=== Refreshed ===\n %s" % ppd(refreshed))
+verbose(1, "=== Unhandled ===\n %s" % ppd(unhandled))
+verbose(1, "=== Skipped: %d, %d out of %d unique IP/Agent" % (skipped, len(ips), nwith_ips))
diff --git a/survey/postprocdata.py b/survey/postprocdata.py
deleted file mode 100755 (executable)
index 4cc3724..0000000
+++ /dev/null
@@ -1,235 +0,0 @@
-#!/usr/bin/python
-#emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- 
-#ex: set sts=4 ts=4 sw=4 noet:
-#------------------------- =+- Python script -+= -------------------------
-"""
- @file      postprocdata.py
- @date      Tue May 24 10:28:28 2011
- @brief
-
-
-  Yaroslav Halchenko                                            Dartmouth
-  web:     http://www.onerussian.com                              College
-  e-mail:  yoh@onerussian.com                              ICQ#: 60653192
-
- DESCRIPTION (NOTES):
-
- COPYRIGHT: Yaroslav Halchenko 2011
-
- LICENSE: MIT
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-"""
-#-----------------\____________________________________/------------------
-
-__author__ = 'Yaroslav Halchenko'
-__revision__ = '$Revision: $'
-__date__ = '$Date:  $'
-__copyright__ = 'Copyright (c) 2011 Yaroslav Halchenko'
-__license__ = 'GPL'
-
-
-import os, sys, glob, json, re, shutil
-from copy import copy
-from mvpa.base import verbose
-verbose.level = 2
-datain = 'data'
-dataout = 'dataout'
-dataorig = 'dataorig'
-
-blacklist = ['1305808539.9.json', '1305808540.1.json', '1305808541.03.json', # persistent and curious mind-ware guy from Israel
-             ]
-
-all_subs = dict(
-    sw_other_name=dict(
-        sw_electro=dict(
-            cedspike='ced *spike2*',                        # NEW: http://www.ced.co.uk/pru.shtml?spk4wglu.htm
-            datariver='exp control: datariver',             # NEW: http://sccn.ucsd.edu/wiki/DataSuite
-            eeglab='(eeglab|http://sccn.ucsd.edu/eeglab/)',
-            emse='emse',                                    # REFRESH
-            erplab='erplab',                                # NEW:     ERPLAB
-            klusters='klusters.*',                          # REFRESH
-            netstation='egi net station',                   # NEW:     EGI Net Station
-            neuroscan='(curry|neuroscan(| curry))',         # REFRESH
-            neuroscope='.*neuroscope',                      # REFRESH
-            nutmeg='.*nutmeg',                              # NEW
-            ),
-        sw_img=dict(
-            mricron='mricrogl',
-            afni='afni for bci',
-            dtistudio='dti-*studio',    # NEW: or MRIStudio?
-            brainsight='brainsight',    # NEW: BrainSight
-            nordicice='nordic ice',     # NEW: NordicICE  -- just 1
-            trackvis='trackvis',
-            xmedcon='xmedcon',          # NEW
-            ),
-        sw_general=dict(
-            lua='lua',                  # NEW
-            stata='stata',              # NEW
-            statistica='statistica',    # NEW
-            java='java',                # REFRESH
-            ),
-        sw_neusys=dict(
-            neuroml='neuroml',          # NEW: NeuroML -- more of a framework/standard than software
-            xpp='xpp(|y|aut)',        # REFRESH: XPP/XPPAUT and Python interface
-            ),
-        sw_psychphys=dict(
-            asf='asf',                  # NEW: ASF  http://code.google.com/p/asf/
-            cogent='cogent(|2000)',     # REFRESH
-            crsvsg='crs toolbox.*',     # NEW: CRS VSG Toolbox  http://www.crsltd.com/catalog/vsgtoolbox/
-            mindware='mind-ware',       # NEW: MindWare
-            nordicaktiva='nordic aktiva', # NEW:    NordicActiva  -- just 1 http://www.nordicneurolab.com/Products_and_Solutions/Clinical_Software_Solutions/nordicActiva.aspx  http://www.nordicneurolab.com/Products_and_Solutions/Clinical_Software_Solutions/nordicAktiva.aspx
-            superlab='superlab',        # REFRESH
-            psignifit='psignifit(|3)',  # NEW
-            ),
-        ignore=dict(ignore=
-                    '(zsh vim mutt git'
-                    # just ignore
-                    '|my overall time.*|separate work.*|60% windows'
-                    '|.*my own .*software'
-                    # Different generic visualization solutions
-                    '|gnupot|.*gnu plot.*xmgrace|mayavi|matplotlib'
-                    '|trackvis'
-                    '|opengl|itk|vtk'
-                    '|paraview'
-                    # Really cool one for graphs
-                    '|gephi'
-                    # Generic DBs
-                    '|mysql|postgresql'
-                    # DB with imaging data (Italy?) but just once
-                    '|loris multi-site database system'
-                    # More languages/platforms?
-                    '|.net|haskel|gsl|cuda'
-                    # Python lovers
-                    '|theano|pygame|numpy|mdp|joblib|scipy|pytables|sympy'
-                    # ML toolboxes
-                    '|scikits-learn|probid .*'
-                    # Reference managers
-                    '|mendeley|jabref'
-                    # Python IDE?? quite nice btw
-                    '|spyder'
-                    # Move into survey?
-                    '|.*magnetic source locator.*' # Some kind of MEG inverse solver -- publications but no public project
-                    ')'
-                    ),
-        ),
-    )
-
-for d in dataout, dataorig:
-    if os.path.exists(d):
-        shutil.rmtree(d)
-    os.makedirs(d)
-
-def ndiffer(d1, d2, skip=['timestamp']):
-    n = 0
-    for key in d1.keys():
-        if key in skip:
-            continue
-        if d1[key] != d2.get(key, 'XXX'):
-            n += 1
-    return n
-
-ips = {}
-nwith_ips = 0
-unhandled = {}
-refreshed = {}
-infiles = glob.glob(os.path.join(datain, '*.json'))
-skipped = 0
-#infiles = glob.glob(os.path.join(datain, '1305741725.57.json'))
-for f in infiles:
-    fname = os.path.basename(f)
-    if fname in blacklist:
-        verbose(1, "Skipping %s because of blacklist" % f)
-        skipped += 1
-        continue
-    verbose(5, "Loading %s" % f)
-    j = json.load(open(f))
-    if [j.get(x) for x in 'man_os', 'pers_os', 'virt_host_os', 'bg_datamod'] == \
-       ['none', 'none', 'none', None]:
-        verbose(1, "Skipping %s because all systems are nones" % f)
-        skipped += 1
-        continue
-
-    if 'remote_addr' in j:
-        nwith_ips += 1
-        ip = j['remote_addr']
-        agent = j.get('user_agent', None)
-        previous_entries = ips.get((ip, agent), [])
-        # Let's see if we catch results seekers -- check for how many
-        # fields are identical
-        if len(previous_entries):
-            diffs = [ndiffer(x, j) for x in previous_entries]
-            if min(diffs) < 2:
-                verbose(1, "Skipping %s because there is a previous entry which differs only in %d fields" % (f, min(diffs),))
-                skipped += 1
-                continue
-        ips[(ip, agent)] = previous_entries + [j]
-
-    json.dump(j, open(os.path.join(dataorig, fname), 'w'), indent=2)
-    for ofield, osubs in all_subs.iteritems():
-        if not (ofield in j and j[ofield]):
-            continue
-        csv = j[ofield]
-        values = [x.strip().lower() for x in re.split('[+,|;]', csv)]
-        values = [v for v in values if len(v)]
-        original_values = values[:]
-        verbose(3, "Working on %s: %r" % (ofield, values))
-        for sfield, ssubs in osubs.iteritems():
-            srecord = copy(j.get(sfield, []))
-            old_srecord = j.get(sfield, [])
-            for name, regex in ssubs.iteritems():
-                for i, v in enumerate(values):
-                    if v is not None and re.match(regex, v):
-                        # Found a match -- need to adjust the record
-                        # and replace with None in values
-                        values[i] = None
-                        if name in old_srecord:
-                            verbose(1, "Value %s is already in %s=%s" % (v, sfield, old_srecord))
-                        else:
-                            verbose(4, "Adding value %s for %s to %s" % (v, name, sfield))
-                            srecord.append(name)
-                        if sfield == 'ignore':
-                            # unhandled[v] = unhandled.get(v, 0) + 1
-                            pass
-                        else:
-                            refreshed[name] = refreshed.get(name, 0) + 1
-            values = [v for v in values if v is not None]
-            if sfield == 'ignore':
-                verbose(4, "Skipping ignore")
-                continue
-            if srecord != old_srecord:
-                verbose(4, "Adjusting %s to %s" % (old_srecord, srecord))
-                j[sfield] = srecord
-        if len(values):
-            verbose(4, "Left unhandled: %s" % (values,))
-            for v in values:
-                unhandled[v] = unhandled.get(v, 0) + 1
-    verbose(3, "Storing file %s" % fname)
-    json.dump(j, open(os.path.join(dataout, fname), 'w'), indent=2)
-    #open(os.path.join(dataout, fname), 'w').write(json.write(j))
-
-bad_ips = [x for x in ips.items() if len(x[1])>1]
-
-def ppd(d):
-    keys = sorted(d.keys())
-    return '\n '.join(["%s: %d" % (k, d[k]) for k in keys])
-
-verbose(1, "=== Refreshed ===\n %s" % ppd(refreshed))
-verbose(1, "=== Unhandled ===\n %s" % ppd(unhandled))
-verbose(1, "=== Skipped: %d, %d out of %d unique IP/Agent" % (skipped, len(ips), nwith_ips))