#!/usr/bin/python
#emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
#ex: set sts=4 ts=4 sw=4 noet:
#------------------------- =+- Python script -+= -------------------------
"""
 @file      postprocdata.py
 @date      Tue May 24 10:28:28 2011
 @brief

  Yaroslav Halchenko                                            Dartmouth
  web:     http://www.onerussian.com                              College
  e-mail:  yoh@onerussian.com                              ICQ#: 60653192

 DESCRIPTION (NOTES):

 COPYRIGHT: Yaroslav Halchenko 2011

 LICENSE: MIT

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
  in the Software without restriction, including without limitation the rights
  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  copies of the Software, and to permit persons to whom the Software is
  furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be included in
  all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  DEALINGS IN THE SOFTWARE.
"""
#-----------------\____________________________________/------------------

__author__ = 'Yaroslav Halchenko'
__revision__ = '$Revision: $'
__date__ = '$Date: $'
__copyright__ = 'Copyright (c) 2011 Yaroslav Halchenko'
__license__ = 'MIT'  # kept consistent with the LICENSE stated above

import os, sys, glob, json, re, shutil
from copy import copy

from mvpa.base import verbose

verbose.level = 2

# Input and output directories
datain = 'data'
dataout = 'dataout'
dataorig = 'dataorig'

# Submissions to exclude entirely
blacklist = ['1305808539.9.json',
             '1305808540.1.json',
             '1305808541.03.json',  # persistent and curious mind-ware guy from Israel
             ]

# Maps each free-text survey field (e.g. sw_other_name) to categorized
# fields; for every categorized field, a dict of name -> regex is used to
# recognize known software in the free-text responses.
all_subs = dict(
    sw_other_name=dict(
        sw_electro=dict(
            cedspike='ced *spike2*',            # NEW: http://www.ced.co.uk/pru.shtml?spk4wglu.htm
            datariver='exp control: datariver', # NEW: http://sccn.ucsd.edu/wiki/DataSuite
            eeglab='(eeglab|http://sccn.ucsd.edu/eeglab/)',
            emse='emse',                        # REFRESH
            erplab='erplab',                    # NEW: ERPLAB
            klusters='klusters.*',              # REFRESH
            netstation='egi net station',       # NEW: EGI Net Station
            neuroscan='(curry|neuroscan(| curry))', # REFRESH
            neuroscope='.*neuroscope',          # REFRESH
            nutmeg='.*nutmeg',                  # NEW
            ),
        sw_img=dict(
            mricron='mricrogl',
            afni='afni for bci',
            dtistudio='dti-*studio',            # NEW: or MRIStudio?
            brainsight='brainsight',            # NEW: BrainSight
            nordicice='nordic ice',             # NEW: NordicICE -- just 1
            trackvis='trackvis',
            xmedcon='xmedcon',                  # NEW
            ),
        sw_general=dict(
            lua='lua',                          # NEW
            stata='stata',                      # NEW
            statistica='statistica',            # NEW
            java='java',                        # REFRESH
            ),
        sw_neusys=dict(
            neuroml='neuroml',                  # NEW: NeuroML -- more of a framework/standard than software
            xpp='xpp(|y|aut)',                  # REFRESH: XPP/XPPAUT and Python interface
            ),
        sw_psychphys=dict(
            asf='asf',                          # NEW: ASF http://code.google.com/p/asf/
            cogent='cogent(|2000)',             # REFRESH
            crsvsg='crs toolbox.*',             # NEW: CRS VSG Toolbox http://www.crsltd.com/catalog/vsgtoolbox/
            mindware='mind-ware',               # NEW: MindWare
            nordicaktiva='nordic aktiva',       # NEW: NordicActiva -- just 1
            # http://www.nordicneurolab.com/Products_and_Solutions/Clinical_Software_Solutions/nordicActiva.aspx
            # http://www.nordicneurolab.com/Products_and_Solutions/Clinical_Software_Solutions/nordicAktiva.aspx
            superlab='superlab',                # REFRESH
            psignifit='psignifit(|3)',          # NEW
            ),
        ignore=dict(ignore=
            '(zsh vim mutt git'                 # just ignore
            '|my overall time.*|separate work.*|60% windows'
            '|.*my own .*software'
            # Different generic visualization solutions
            '|gnupot|.*gnu plot.*xmgrace|mayavi|matplotlib'
            '|trackvis'
            '|opengl|itk|vtk'
            '|paraview'
            # Really cool one for graphs
            '|gephi'
            # Generic DBs
            '|mysql|postgresql'
            # DB with imaging data (Italy?) but just once
            '|loris multi-site database system'
            # More languages/platforms?
            '|.net|haskel|gsl|cuda'
            # Python lovers
            '|theano|pygame|numpy|mdp|joblib|scipy|pytables|sympy'
            # ML toolboxes
            '|scikits-learn|probid .*'
            # Reference managers
            '|mendeley|jabref'
            # Python IDE?? quite nice btw
            '|spyder'
            # Move into survey?
            '|.*magnetic source locator.*'      # Some kind of MEG inverse solver -- publications but no public project
            ')'
            ),
        ),
    )

# Start from fresh output directories
for d in dataout, dataorig:
    if os.path.exists(d):
        shutil.rmtree(d)
    os.makedirs(d)


def ndiffer(d1, d2, skip=['timestamp']):
    """Count in how many fields (besides the skipped ones) d1 and d2 differ."""
    n = 0
    for key in d1.keys():
        if key in skip:
            continue
        if d1[key] != d2.get(key, 'XXX'):
            n += 1
    return n


ips = {}
nwith_ips = 0
unhandled = {}
refreshed = {}
infiles = glob.glob(os.path.join(datain, '*.json'))
skipped = 0
#infiles = glob.glob(os.path.join(datain, '1305741725.57.json'))

for f in infiles:
    fname = os.path.basename(f)
    if fname in blacklist:
        verbose(1, "Skipping %s because of blacklist" % f)
        skipped += 1
        continue

    verbose(5, "Loading %s" % f)
    j = json.load(open(f))

    if [j.get(x) for x in ('man_os', 'pers_os', 'virt_host_os', 'bg_datamod')] == \
           ['none', 'none', 'none', None]:
        verbose(1, "Skipping %s because all systems are nones" % f)
        skipped += 1
        continue

    if 'remote_addr' in j:
        nwith_ips += 1
        ip = j['remote_addr']
        agent = j.get('user_agent', None)
        previous_entries = ips.get((ip, agent), [])
        # Let's see if we catch results seekers -- check for how many
        # fields are identical
        if len(previous_entries):
            diffs = [ndiffer(x, j) for x in previous_entries]
            if min(diffs) < 2:
                verbose(1, "Skipping %s because there is a previous entry "
                           "which differs only in %d fields" % (f, min(diffs)))
                skipped += 1
                continue
        ips[(ip, agent)] = previous_entries + [j]

    # Store the original (pre-processed) entry
    json.dump(j, open(os.path.join(dataorig, fname), 'w'), indent=2)

    for ofield, osubs in all_subs.iteritems():
        if not (ofield in j and j[ofield]):
            continue
        csv = j[ofield]
        values = [x.strip().lower() for x in re.split('[+,|;]', csv)]
        values = [v for v in values if len(v)]
        original_values = values[:]
        verbose(3, "Working on %s: %r" % (ofield, values))
        for sfield, ssubs in osubs.iteritems():
            srecord = copy(j.get(sfield, []))
            old_srecord = j.get(sfield, [])
            for name, regex in ssubs.iteritems():
                for i, v in enumerate(values):
                    if v is not None and re.match(regex, v):
                        # Found a match -- need to adjust the record
                        # and replace with None in values
                        values[i] = None
                        if name in old_srecord:
                            verbose(1, "Value %s is already in %s=%s"
                                       % (v, sfield, old_srecord))
                        else:
                            verbose(4, "Adding value %s for %s to %s"
                                       % (v, name, sfield))
                            srecord.append(name)
                            if sfield == 'ignore':
                                # unhandled[v] = unhandled.get(v, 0) + 1
                                pass
                            else:
                                refreshed[name] = refreshed.get(name, 0) + 1
            values = [v for v in values if v is not None]
            if sfield == 'ignore':
                verbose(4, "Skipping ignore")
                continue
            if srecord != old_srecord:
                verbose(4, "Adjusting %s to %s" % (old_srecord, srecord))
                j[sfield] = srecord
        if len(values):
            verbose(4, "Left unhandled: %s" % (values,))
            for v in values:
                unhandled[v] = unhandled.get(v, 0) + 1

    verbose(3, "Storing file %s" % fname)
    json.dump(j, open(os.path.join(dataout, fname), 'w'), indent=2)
    #open(os.path.join(dataout, fname), 'w').write(json.write(j))

bad_ips = [x for x in ips.items() if len(x[1]) > 1]


def ppd(d):
    keys = sorted(d.keys())
    return '\n '.join(["%s: %d" % (k, d[k]) for k in keys])

verbose(1, "=== Refreshed ===\n %s" % ppd(refreshed))
verbose(1, "=== Unhandled ===\n %s" % ppd(unhandled))
verbose(1, "=== Skipped: %d, %d out of %d unique IP/Agent"
           % (skipped, len(ips), nwith_ips))
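
# A minimal sketch (kept as a comment so it does not affect processing) of
# how the near-duplicate heuristic above behaves: two entries from the same
# IP/agent differing in fewer than 2 fields besides the skipped 'timestamp'
# get the later submission dropped. The field values here are hypothetical:
#
#   >>> ndiffer({'timestamp': 1, 'sw_img': ['afni'], 'pers_os': 'linux'},
#   ...         {'timestamp': 2, 'sw_img': ['afni'], 'pers_os': 'osx'})
#   1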