--- /dev/null
+#!/usr/bin/python
+#emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
+#ex: set sts=4 ts=4 sw=4 noet:
+#------------------------- =+- Python script -+= -------------------------
+"""
+ @file postprocdata.py
+ @date Tue May 24 10:28:28 2011
+ @brief
+
+
+ Yaroslav Halchenko Dartmouth
+ web: http://www.onerussian.com College
+ e-mail: yoh@onerussian.com ICQ#: 60653192
+
+ DESCRIPTION (NOTES):
+
+ COPYRIGHT: Yaroslav Halchenko 2011
+
+ LICENSE: MIT
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+"""
+#-----------------\____________________________________/------------------
+
+__author__ = 'Yaroslav Halchenko'
+__revision__ = '$Revision: $'
+__date__ = '$Date: $'
+__copyright__ = 'Copyright (c) 2011 Yaroslav Halchenko'
+# NOTE(review): the module docstring above states "LICENSE: MIT" while
+# __license__ below says 'GPL' -- confirm which license is actually intended.
+__license__ = 'GPL'
+
+
+import os, sys, glob, json, re, shutil
+from copy import copy
+from mvpa.base import verbose
+# PyMVPA verbosity logger: verbose(level, msg) prints msg when
+# level <= verbose.level.
+verbose.level = 2
+# Directory with the raw survey submissions (one JSON file each).
+datain = 'data'
+# Output directories: post-processed entries and untouched originals.
+dataout = 'dataout'
+dataorig = 'dataorig'
+
+# Submission files to drop entirely (known duplicate/noise entries).
+blacklist = ['1305808539.9.json', '1305808540.1.json', '1305808541.03.json', # persistent and curious mind-ware guy from Israel
+             ]
+
+# Canonicalization table for free-text survey answers:
+#   {free-text field -> {structured field -> {canonical name: regex}}}
+# Tokens of the free-text field (lowercased, split on '+', ',', '|', ';' by
+# the main loop below) that match a regex get the canonical name appended to
+# the corresponding structured field's list; the special 'ignore' bucket
+# simply discards matching tokens.  Patterns are matched with re.match, i.e.
+# anchored at the start of the token only.
+all_subs = dict(
+    sw_other_name=dict(
+        sw_electro=dict(
+            cedspike='ced *spike2*', # NEW: http://www.ced.co.uk/pru.shtml?spk4wglu.htm
+            datariver='exp control: datariver', # NEW: http://sccn.ucsd.edu/wiki/DataSuite
+            eeglab='(eeglab|http://sccn.ucsd.edu/eeglab/)',
+            emse='emse', # REFRESH
+            erplab='erplab', # NEW: ERPLAB
+            klusters='klusters.*', # REFRESH
+            netstation='egi net station', # NEW: EGI Net Station
+            neuroscan='(curry|neuroscan(| curry))', # REFRESH
+            neuroscope='.*neuroscope', # REFRESH
+            nutmeg='.*nutmeg', # NEW
+            ),
+        sw_img=dict(
+            # NOTE(review): key is 'mricron' but the pattern matches
+            # 'mricrogl' (MRIcroGL) -- confirm which tool is meant.
+            mricron='mricrogl',
+            afni='afni for bci',
+            dtistudio='dti-*studio', # NEW: or MRIStudio?
+            brainsight='brainsight', # NEW: BrainSight
+            nordicice='nordic ice', # NEW: NordicICE -- just 1
+            trackvis='trackvis',
+            xmedcon='xmedcon', # NEW
+            ),
+        sw_general=dict(
+            lua='lua', # NEW
+            stata='stata', # NEW
+            statistica='statistica', # NEW
+            java='java', # REFRESH
+            ),
+        sw_neusys=dict(
+            neuroml='neuroml', # NEW: NeuroML -- more of a framework/standard than software
+            xpp='xpp(|y|aut)', # REFRESH: XPP/XPPAUT and Python interface
+            ),
+        sw_psychphys=dict(
+            asf='asf', # NEW: ASF http://code.google.com/p/asf/
+            cogent='cogent(|2000)', # REFRESH
+            crsvsg='crs toolbox.*', # NEW: CRS VSG Toolbox http://www.crsltd.com/catalog/vsgtoolbox/
+            mindware='mind-ware', # NEW: MindWare
+            nordicaktiva='nordic aktiva', # NEW: NordicActiva -- just 1 http://www.nordicneurolab.com/Products_and_Solutions/Clinical_Software_Solutions/nordicActiva.aspx http://www.nordicneurolab.com/Products_and_Solutions/Clinical_Software_Solutions/nordicAktiva.aspx
+            superlab='superlab', # REFRESH
+            psignifit='psignifit(|3)', # NEW
+            ),
+        ignore=dict(ignore=
+                    '(zsh vim mutt git'
+                    # just ignore
+                    '|my overall time.*|separate work.*|60% windows'
+                    '|.*my own .*software'
+                    # Different generic visualization solutions
+                    # NOTE(review): 'gnupot' looks like a typo for 'gnuplot',
+                    # and a '|' seems to be missing between '.*gnu plot.*'
+                    # and 'xmgrace' -- confirm intended alternatives.
+                    '|gnupot|.*gnu plot.*xmgrace|mayavi|matplotlib'
+                    '|trackvis'
+                    '|opengl|itk|vtk'
+                    '|paraview'
+                    # Really cool one for graphs
+                    '|gephi'
+                    # Generic DBs
+                    '|mysql|postgresql'
+                    # DB with imaging data (Italy?) but just once
+                    '|loris multi-site database system'
+                    # More languages/platforms?
+                    '|.net|haskel|gsl|cuda'
+                    # Python lovers
+                    '|theano|pygame|numpy|mdp|joblib|scipy|pytables|sympy'
+                    # ML toolboxes
+                    '|scikits-learn|probid .*'
+                    # Reference managers
+                    '|mendeley|jabref'
+                    # Python IDE?? quite nice btw
+                    '|spyder'
+                    # Move into survey?
+                    '|.*magnetic source locator.*' # Some kind of MEG inverse solver -- publications but no public project
+                    ')'
+                    ),
+        ),
+    )
+
+# Start from clean output directories on every run (previous results are
+# discarded and regenerated from the inputs).
+for d in dataout, dataorig:
+    if os.path.exists(d):
+        shutil.rmtree(d)
+    os.makedirs(d)
+
+def ndiffer(d1, d2, skip=['timestamp']):
+    """Count keys of d1 (except those in `skip`) whose values differ in d2.
+
+    A key absent from d2 counts as differing ('XXX' is used as an unlikely
+    sentinel default).  Keys present only in d2 are not considered.
+    """
+    # NOTE: mutable default argument for `skip` -- harmless here since the
+    # function never mutates it.
+    n = 0
+    for key in d1.keys():
+        if key in skip:
+            continue
+        if d1[key] != d2.get(key, 'XXX'):
+            n += 1
+    return n
+
+# (ip, user_agent) -> list of accepted submissions from that client;
+# used to detect repeat submitters.
+ips = {}
+# How many submissions carried a remote_addr at all.
+nwith_ips = 0
+# Free-text tokens that matched no regex in all_subs -> occurrence count.
+unhandled = {}
+# Canonical names promoted into structured fields -> occurrence count.
+refreshed = {}
+infiles = glob.glob(os.path.join(datain, '*.json'))
+skipped = 0
+#infiles = glob.glob(os.path.join(datain, '1305741725.57.json'))
+# Main pass: filter out blacklisted/empty/duplicate submissions, keep a
+# pristine copy in dataorig, canonicalize free-text answers per all_subs,
+# and store the cleaned-up entry in dataout.
+for f in infiles:
+    fname = os.path.basename(f)
+    if fname in blacklist:
+        verbose(1, "Skipping %s because of blacklist" % f)
+        skipped += 1
+        continue
+    verbose(5, "Loading %s" % f)
+    j = json.load(open(f))
+    # Drop entries where all OS answers are 'none' and no background data
+    # modality was given.  NOTE(review): the unparenthesized tuple in the
+    # list comprehension is Python 2 only syntax (consistent with the
+    # .iteritems() calls below) -- this script will not run under Python 3.
+    if [j.get(x) for x in 'man_os', 'pers_os', 'virt_host_os', 'bg_datamod'] == \
+       ['none', 'none', 'none', None]:
+        verbose(1, "Skipping %s because all systems are nones" % f)
+        skipped += 1
+        continue
+
+    # Deduplicate by (IP, user agent): near-identical resubmissions are
+    # treated as results-seekers and skipped.
+    if 'remote_addr' in j:
+        nwith_ips += 1
+        ip = j['remote_addr']
+        agent = j.get('user_agent', None)
+        previous_entries = ips.get((ip, agent), [])
+        # Let's see if we catch results seekers -- check for how many
+        # fields are identical
+        if len(previous_entries):
+            diffs = [ndiffer(x, j) for x in previous_entries]
+            # Fewer than 2 fields differ from some earlier accepted entry
+            # of the same client -- consider it a duplicate.
+            if min(diffs) < 2:
+                verbose(1, "Skipping %s because there is a previous entry which differs only in %d fields" % (f, min(diffs),))
+                skipped += 1
+                continue
+        ips[(ip, agent)] = previous_entries + [j]
+
+    # Pristine copy of the accepted entry before any substitutions.
+    json.dump(j, open(os.path.join(dataorig, fname), 'w'), indent=2)
+    for ofield, osubs in all_subs.iteritems():
+        if not (ofield in j and j[ofield]):
+            continue
+        csv = j[ofield]
+        # Tokenize the free-text answer: split on + , | ; and lowercase.
+        values = [x.strip().lower() for x in re.split('[+,|;]', csv)]
+        values = [v for v in values if len(v)]
+        original_values = values[:]
+        verbose(3, "Working on %s: %r" % (ofield, values))
+        for sfield, ssubs in osubs.iteritems():
+            srecord = copy(j.get(sfield, []))
+            old_srecord = j.get(sfield, [])
+            for name, regex in ssubs.iteritems():
+                for i, v in enumerate(values):
+                    # Matched tokens are replaced with None (and filtered
+                    # out below) so each token is consumed at most once.
+                    if v is not None and re.match(regex, v):
+                        # Found a match -- need to adjust the record
+                        # and replace with None in values
+                        values[i] = None
+                        if name in old_srecord:
+                            verbose(1, "Value %s is already in %s=%s" % (v, sfield, old_srecord))
+                        else:
+                            verbose(4, "Adding value %s for %s to %s" % (v, name, sfield))
+                            srecord.append(name)
+                            if sfield == 'ignore':
+                                # unhandled[v] = unhandled.get(v, 0) + 1
+                                pass
+                            else:
+                                refreshed[name] = refreshed.get(name, 0) + 1
+            values = [v for v in values if v is not None]
+            # The 'ignore' bucket only consumes tokens; it never becomes a
+            # field of the stored entry.
+            if sfield == 'ignore':
+                verbose(4, "Skipping ignore")
+                continue
+            if srecord != old_srecord:
+                verbose(4, "Adjusting %s to %s" % (old_srecord, srecord))
+                j[sfield] = srecord
+        # Whatever tokens survive all categories are tallied as unhandled.
+        if len(values):
+            verbose(4, "Left unhandled: %s" % (values,))
+            for v in values:
+                unhandled[v] = unhandled.get(v, 0) + 1
+    verbose(3, "Storing file %s" % fname)
+    json.dump(j, open(os.path.join(dataout, fname), 'w'), indent=2)
+    #open(os.path.join(dataout, fname), 'w').write(json.write(j))
+
+# IP/agent pairs with more than one accepted submission (computed for
+# inspection; not reported below).
+bad_ips = [x for x in ips.items() if len(x[1])>1]
+
+def ppd(d):
+    """Pretty-print a {name: count} dict, one sorted 'name: count' per line."""
+    keys = sorted(d.keys())
+    return '\n   '.join(["%s: %d" % (k, d[k]) for k in keys])
+
+# Final summary: what got canonicalized, what remains unmatched, and how
+# many submissions were skipped.
+verbose(1, "=== Refreshed ===\n   %s" % ppd(refreshed))
+verbose(1, "=== Unhandled ===\n   %s" % ppd(unhandled))
+verbose(1, "=== Skipped: %d, %d out of %d unique IP/Agent" % (skipped, len(ips), nwith_ips))
+++ /dev/null
-#!/usr/bin/python
-#emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
-#ex: set sts=4 ts=4 sw=4 noet:
-#------------------------- =+- Python script -+= -------------------------
-"""
- @file postprocdata.py
- @date Tue May 24 10:28:28 2011
- @brief
-
-
- Yaroslav Halchenko Dartmouth
- web: http://www.onerussian.com College
- e-mail: yoh@onerussian.com ICQ#: 60653192
-
- DESCRIPTION (NOTES):
-
- COPYRIGHT: Yaroslav Halchenko 2011
-
- LICENSE: MIT
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
-"""
-#-----------------\____________________________________/------------------
-
-__author__ = 'Yaroslav Halchenko'
-__revision__ = '$Revision: $'
-__date__ = '$Date: $'
-__copyright__ = 'Copyright (c) 2011 Yaroslav Halchenko'
-__license__ = 'GPL'
-
-
-import os, sys, glob, json, re, shutil
-from copy import copy
-from mvpa.base import verbose
-verbose.level = 2
-datain = 'data'
-dataout = 'dataout'
-dataorig = 'dataorig'
-
-blacklist = ['1305808539.9.json', '1305808540.1.json', '1305808541.03.json', # persistent and curious mind-ware guy from Israel
- ]
-
-all_subs = dict(
- sw_other_name=dict(
- sw_electro=dict(
- cedspike='ced *spike2*', # NEW: http://www.ced.co.uk/pru.shtml?spk4wglu.htm
- datariver='exp control: datariver', # NEW: http://sccn.ucsd.edu/wiki/DataSuite
- eeglab='(eeglab|http://sccn.ucsd.edu/eeglab/)',
- emse='emse', # REFRESH
- erplab='erplab', # NEW: ERPLAB
- klusters='klusters.*', # REFRESH
- netstation='egi net station', # NEW: EGI Net Station
- neuroscan='(curry|neuroscan(| curry))', # REFRESH
- neuroscope='.*neuroscope', # REFRESH
- nutmeg='.*nutmeg', # NEW
- ),
- sw_img=dict(
- mricron='mricrogl',
- afni='afni for bci',
- dtistudio='dti-*studio', # NEW: or MRIStudio?
- brainsight='brainsight', # NEW: BrainSight
- nordicice='nordic ice', # NEW: NordicICE -- just 1
- trackvis='trackvis',
- xmedcon='xmedcon', # NEW
- ),
- sw_general=dict(
- lua='lua', # NEW
- stata='stata', # NEW
- statistica='statistica', # NEW
- java='java', # REFRESH
- ),
- sw_neusys=dict(
- neuroml='neuroml', # NEW: NeuroML -- more of a framework/standard than software
- xpp='xpp(|y|aut)', # REFRESH: XPP/XPPAUT and Python interface
- ),
- sw_psychphys=dict(
- asf='asf', # NEW: ASF http://code.google.com/p/asf/
- cogent='cogent(|2000)', # REFRESH
- crsvsg='crs toolbox.*', # NEW: CRS VSG Toolbox http://www.crsltd.com/catalog/vsgtoolbox/
- mindware='mind-ware', # NEW: MindWare
- nordicaktiva='nordic aktiva', # NEW: NordicActiva -- just 1 http://www.nordicneurolab.com/Products_and_Solutions/Clinical_Software_Solutions/nordicActiva.aspx http://www.nordicneurolab.com/Products_and_Solutions/Clinical_Software_Solutions/nordicAktiva.aspx
- superlab='superlab', # REFRESH
- psignifit='psignifit(|3)', # NEW
- ),
- ignore=dict(ignore=
- '(zsh vim mutt git'
- # just ignore
- '|my overall time.*|separate work.*|60% windows'
- '|.*my own .*software'
- # Different generic visualization solutions
- '|gnupot|.*gnu plot.*xmgrace|mayavi|matplotlib'
- '|trackvis'
- '|opengl|itk|vtk'
- '|paraview'
- # Really cool one for graphs
- '|gephi'
- # Generic DBs
- '|mysql|postgresql'
- # DB with imaging data (Italy?) but just once
- '|loris multi-site database system'
- # More languages/platforms?
- '|.net|haskel|gsl|cuda'
- # Python lovers
- '|theano|pygame|numpy|mdp|joblib|scipy|pytables|sympy'
- # ML toolboxes
- '|scikits-learn|probid .*'
- # Reference managers
- '|mendeley|jabref'
- # Python IDE?? quite nice btw
- '|spyder'
- # Move into survey?
- '|.*magnetic source locator.*' # Some kind of MEG inverse solver -- publications but no public project
- ')'
- ),
- ),
- )
-
-for d in dataout, dataorig:
- if os.path.exists(d):
- shutil.rmtree(d)
- os.makedirs(d)
-
-def ndiffer(d1, d2, skip=['timestamp']):
- n = 0
- for key in d1.keys():
- if key in skip:
- continue
- if d1[key] != d2.get(key, 'XXX'):
- n += 1
- return n
-
-ips = {}
-nwith_ips = 0
-unhandled = {}
-refreshed = {}
-infiles = glob.glob(os.path.join(datain, '*.json'))
-skipped = 0
-#infiles = glob.glob(os.path.join(datain, '1305741725.57.json'))
-for f in infiles:
- fname = os.path.basename(f)
- if fname in blacklist:
- verbose(1, "Skipping %s because of blacklist" % f)
- skipped += 1
- continue
- verbose(5, "Loading %s" % f)
- j = json.load(open(f))
- if [j.get(x) for x in 'man_os', 'pers_os', 'virt_host_os', 'bg_datamod'] == \
- ['none', 'none', 'none', None]:
- verbose(1, "Skipping %s because all systems are nones" % f)
- skipped += 1
- continue
-
- if 'remote_addr' in j:
- nwith_ips += 1
- ip = j['remote_addr']
- agent = j.get('user_agent', None)
- previous_entries = ips.get((ip, agent), [])
- # Let's see if we catch results seekers -- check for how many
- # fields are identical
- if len(previous_entries):
- diffs = [ndiffer(x, j) for x in previous_entries]
- if min(diffs) < 2:
- verbose(1, "Skipping %s because there is a previous entry which differs only in %d fields" % (f, min(diffs),))
- skipped += 1
- continue
- ips[(ip, agent)] = previous_entries + [j]
-
- json.dump(j, open(os.path.join(dataorig, fname), 'w'), indent=2)
- for ofield, osubs in all_subs.iteritems():
- if not (ofield in j and j[ofield]):
- continue
- csv = j[ofield]
- values = [x.strip().lower() for x in re.split('[+,|;]', csv)]
- values = [v for v in values if len(v)]
- original_values = values[:]
- verbose(3, "Working on %s: %r" % (ofield, values))
- for sfield, ssubs in osubs.iteritems():
- srecord = copy(j.get(sfield, []))
- old_srecord = j.get(sfield, [])
- for name, regex in ssubs.iteritems():
- for i, v in enumerate(values):
- if v is not None and re.match(regex, v):
- # Found a match -- need to adjust the record
- # and replace with None in values
- values[i] = None
- if name in old_srecord:
- verbose(1, "Value %s is already in %s=%s" % (v, sfield, old_srecord))
- else:
- verbose(4, "Adding value %s for %s to %s" % (v, name, sfield))
- srecord.append(name)
- if sfield == 'ignore':
- # unhandled[v] = unhandled.get(v, 0) + 1
- pass
- else:
- refreshed[name] = refreshed.get(name, 0) + 1
- values = [v for v in values if v is not None]
- if sfield == 'ignore':
- verbose(4, "Skipping ignore")
- continue
- if srecord != old_srecord:
- verbose(4, "Adjusting %s to %s" % (old_srecord, srecord))
- j[sfield] = srecord
- if len(values):
- verbose(4, "Left unhandled: %s" % (values,))
- for v in values:
- unhandled[v] = unhandled.get(v, 0) + 1
- verbose(3, "Storing file %s" % fname)
- json.dump(j, open(os.path.join(dataout, fname), 'w'), indent=2)
- #open(os.path.join(dataout, fname), 'w').write(json.write(j))
-
-bad_ips = [x for x in ips.items() if len(x[1])>1]
-
-def ppd(d):
- keys = sorted(d.keys())
- return '\n '.join(["%s: %d" % (k, d[k]) for k in keys])
-
-verbose(1, "=== Refreshed ===\n %s" % ppd(refreshed))
-verbose(1, "=== Unhandled ===\n %s" % ppd(unhandled))
-verbose(1, "=== Skipped: %d, %d out of %d unique IP/Agent" % (skipped, len(ips), nwith_ips))