#!/usr/bin/python
#emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- 
#ex: set sts=4 ts=4 sw=4 et:
#------------------------- =+- Python script -+= -------------------------
"""
 @file      postprocdata.py
 @date      Tue May 24 10:28:28 2011
 @brief     Filter, normalize and anonymize collected JSON submissions


  Yaroslav Halchenko                                            Dartmouth
  web:     http://www.onerussian.com                              College
  e-mail:  yoh@onerussian.com                              ICQ#: 60653192

 DESCRIPTION (NOTES):

 COPYRIGHT: Yaroslav Halchenko 2011

 LICENSE: MIT

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
  in the Software without restriction, including without limitation the rights
  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  copies of the Software, and to permit persons to whom the Software is
  furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be included in
  all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  THE SOFTWARE.
"""
#-----------------\____________________________________/------------------

# Module metadata. The '$Revision: $' / '$Date:  $' values look like
# unexpanded VCS keyword placeholders -- presumably left over from a template.
__author__ = 'Yaroslav Halchenko'
__revision__ = '$Revision: $'
__date__ = '$Date:  $'
__copyright__ = 'Copyright (c) 2011 Yaroslav Halchenko'
# NOTE(review): the module docstring states "LICENSE: MIT" and carries the MIT
# text, while this attribute says 'GPL' -- confirm which one is intended and
# make the two agree.
__license__ = 'GPL'


import os, sys, glob, json, re, shutil
from copy import copy
from mvpa.base import verbose
from common import *

# Verbosity of the `verbose` reporter imported from mvpa.base (presumably
# messages with a level above this value are suppressed -- confirm in mvpa).
verbose.level = 2

# Directory layout, relative to the current working directory.
datain = 'data'        # *.json submissions are read from here
dataout = 'dataout'    # refreshed submissions with shortened IPs go here
dataorig = 'dataorig'  # accepted submissions re-dumped before any refresh

# Submissions dropped unconditionally; matched against the file's basename.
blacklist = [
    # persistent and curious mind-ware guy from Israel
    '1305808539.9.json',
    '1305808540.1.json',
    '1305808541.03.json',
]


# Start every run from empty output directories: whatever a previous run left
# behind is removed, then the directory is created afresh.
for outdir in (dataout, dataorig):
    if os.path.exists(outdir):
        shutil.rmtree(outdir)
    os.makedirs(outdir)

def ndiffer(d1, d2, skip=('timestamp',)):
    """Count the keys of `d1` whose value is different in `d2`.

    The comparison is driven by `d1` only: a key that exists in `d2` but not
    in `d1` is never counted.  A key of `d1` that is absent from `d2` always
    counts as a difference, whatever its value is.

    Parameters
    ----------
    d1 : dict
      Reference record; its keys are the ones being compared.
    d2 : dict
      Record to compare against.
    skip : container of keys
      Keys to leave out of the comparison.

    Returns
    -------
    int
      Number of compared keys with differing (or missing) values.
    """
    # Test membership explicitly rather than comparing against a magic
    # placeholder, so a real value can never be mistaken for "missing".
    return sum(1 for key in d1
               if key not in skip
               and (key not in d2 or d1[key] != d2[key]))

# Bookkeeping reported by the summary at the end of the script.
ips = {}        # (remote_addr, user_agent) -> accepted submissions of that pair
nwith_ips = 0   # submissions carrying 'remote_addr' (counted before dedup)
unhandled = {}  # free-text value no regex claimed -> number of occurrences
refreshed = {}  # canonical name -> number of free-text values mapped onto it
infiles = glob.glob(os.path.join(datain, '*.json'))
skipped = 0
#infiles = glob.glob(os.path.join(datain, '1305741725.57.json'))
for f in infiles:
    fname = os.path.basename(f)
    if fname in blacklist:
        verbose(1, "Skipping %s because of blacklist" % f)
        skipped += 1
        continue
    verbose(5, "Loading %s" % f)
    # Close the handle deterministically instead of leaving it to the GC.
    with open(f) as infile:
        j = json.load(infile)
    # The tuple is parenthesized so the comprehension parses under both
    # Python 2 and Python 3.
    if [j.get(x) for x in ('man_os', 'pers_os', 'virt_host_os', 'bg_datamod')] == \
       ['none', 'none', 'none', None]:
        verbose(1, "Skipping %s because all systems are nones" % f)
        skipped += 1
        continue

    if 'remote_addr' in j:
        nwith_ips += 1
        ip = j['remote_addr']
        agent = j.get('user_agent', None)
        previous_entries = ips.get((ip, agent), [])
        # Let's see if we catch results seekers -- check for how many
        # fields are identical
        # NOTE(review): the records kept in `ips` are the very dicts that get
        # modified further down (refreshed fields, shortened 'remote_addr'),
        # so a new raw record is compared against already post-processed
        # ones.  Left as is because changing it would alter which files get
        # skipped -- confirm whether that is intended.
        if previous_entries:
            diffs = [ndiffer(x, j) for x in previous_entries]
            if min(diffs) < 2:
                verbose(1, "Skipping %s because there is a previous entry which differs only in %d fields" % (f, min(diffs),))
                skipped += 1
                continue
        ips[(ip, agent)] = previous_entries + [j]

    # Keep a copy of the accepted record before any field gets refreshed.
    with open(os.path.join(dataorig, fname), 'w') as outfile:
        json.dump(j, outfile, indent=2)

    # As iterated below, `entries_to_refresh` (from `common`) maps
    #   free-text field -> {target field -> {name: (regex, isnew)}}
    # .items() is used instead of .iteritems() so the loop runs under both
    # Python 2 and Python 3.
    for ofield, osubs in entries_to_refresh.items():
        if not (ofield in j and j[ofield]):
            continue
        raw = j[ofield]
        # Split the free-text answer on any of + , | ; and normalize each item.
        values = [x.strip().lower() for x in re.split('[+,|;]', raw)]
        values = [v for v in values if len(v)]
        verbose(3, "Working on %s: %r" % (ofield, values))
        for sfield, ssubs in osubs.items():
            srecord = copy(j.get(sfield, []))
            old_srecord = j.get(sfield, [])
            for name, (regex, isnew) in ssubs.items():
                for i, v in enumerate(values):
                    if v is not None and re.match(regex, v):
                        # Found a match -- need to adjust the record
                        # and replace with None in values
                        values[i] = None
                        if name in old_srecord:
                            verbose(1, "Value %s is already in %s=%s" % (v, sfield, old_srecord))
                        else:
                            # NOTE(review): a name matched by several values
                            # is appended once per match -- confirm that the
                            # resulting duplicates are acceptable.
                            verbose(4, "Adding value %s for %s to %s" % (v, name, sfield))
                            srecord.append(name)
                        if sfield == 'ignore':
                            # unhandled[v] = unhandled.get(v, 0) + 1
                            pass
                        else:
                            refreshed[name] = refreshed.get(name, 0) + 1
            # Drop the values claimed above so later target fields and the
            # "unhandled" accounting only see what is still unmatched.
            values = [v for v in values if v is not None]
            if sfield == 'ignore':
                verbose(4, "Skipping ignore")
                continue
            if srecord != old_srecord:
                verbose(4, "Adjusting %s to %s" % (old_srecord, srecord))
                j[sfield] = srecord
        if values:
            verbose(4, "Left unhandled: %s" % (values,))
            for v in values:
                unhandled[v] = unhandled.get(v, 0) + 1
    verbose(3, "Storing file %s" % fname)
    # shorten IP -- only when the record has one; the check above already
    # treats 'remote_addr' as optional, so its absence must not abort the run
    if 'remote_addr' in j:
        j['remote_addr'] = '.'.join(j['remote_addr'].split('.')[:2]) + '.x.x'
    with open(os.path.join(dataout, fname), 'w') as outfile:
        json.dump(j, outfile, indent=2)

# (IP, agent) pairs from which more than one submission was accepted, paired
# with those submissions.
bad_ips = [(key, entries) for key, entries in ips.items() if len(entries) > 1]

def ppd(d):
    """Format a {key: integer count} dict as one "key: count" entry per line.

    Entries are ordered by sorted key and joined with a newline followed by
    a single space; an empty dict gives an empty string.
    """
    lines = []
    for key in sorted(d):
        lines.append("%s: %d" % (key, d[key]))
    return '\n '.join(lines)

# Final summary: per-name refresh counts, values no regex claimed, and how
# many submissions were dropped.  Each message is built right before it is
# emitted.
refreshed_report = "=== Refreshed ===\n %s" % ppd(refreshed)
verbose(1, refreshed_report)

unhandled_report = "=== Unhandled ===\n %s" % ppd(unhandled)
verbose(1, unhandled_report)

nunique = len(ips)
verbose(1, "=== Skipped: %d, %d out of %d unique IP/Agent" % (skipped, nunique, nwith_ips))