hackforla · adamkendis · May 21, 2020 · May 5, 2020 · May 21, 2020 · May 21, 2020
diff --git a/server/src/services/dataService.py b/server/src/services/dataService.py
@@ -1,6 +1,7 @@
 import pandas as pd
 from .databaseOrm import Ingest
 from utils.database import db
+from utils.picklebase import pb
 
 
 class DataService(object):
@@ -18,6 +19,12 @@ def standardFilters(self,
         '''
         Generates filters for dates, request types, and ncs.
         '''
+        if pb.enabled:
+            return {
+                'startDate': startDate,
+                'endDate': endDate,
+                'requestTypes': requestTypes,
+                'ncList': ncList}
 
         requestTypes = (', ').join([f"'{rt}'" for rt in requestTypes])
         ncList = (', ').join([str(nc) for nc in ncList])
@@ -37,6 +44,13 @@ def comparisonFilters(self,
         '''
         Generates filters for the comparison endpoints.
         '''
+        if pb.enabled:
+            return {
+                'startDate': startDate,
+                'endDate': endDate,
+                'requestTypes': requestTypes,
+                'ncList': ncList,
+                'cdList': cdList}
 
         requestTypes = (', ').join([f"'{rt}'" for rt in requestTypes])
         if len(ncList) > 0:
@@ -80,6 +94,9 @@ def query(self, fields, filters, table=default_table):
         if not fields or not filters:
             return {'Error': 'fields and filters are required'}
 
+        if pb.enabled:
+            return pb.query(table, fields, filters)
+
         fields = (', ').join(fields)
         return pd.read_sql(f"""
             SELECT {fields}

diff --git a/server/src/utils/picklebase/__init__.py b/server/src/utils/picklebase/__init__.py
@@ -0,0 +1,54 @@
+import os
+from .query import query as query_pb
+from .populate import populate as populate_pb
+
+
+class PickleBase(object):
+    '''
+    Thin facade over the pickle-backed table store.
+
+    `enabled` starts out False; the module-level code below flips it to
+    True only after populate() has completed without raising.
+    '''
+
+    def __init__(self):
+        self.enabled = False
+
+    def populate(self):
+        '''Builds the pickle tables by delegating to populate.populate().'''
+        populate_pb()
+
+    def query(self, table, fields, filters):
+        '''Delegates to query.query() and returns its result unchanged.'''
+        return query_pb(table, fields, filters)
+
+
+def _requested_by_env():
+    '''
+    Returns True when the PICKLEBASE environment variable parses to 1.
+
+    A non-integer value (e.g. "true") is reported and treated as "not
+    requested" instead of raising ValueError while this package is being
+    imported.
+    '''
+    raw = os.environ.get('PICKLEBASE', 0)
+    try:
+        return int(raw) == 1
+    except ValueError:
+        print('IGNORING INVALID PICKLEBASE VALUE: {!r}'.format(raw))
+        return False
+
+
+pb = PickleBase()
+
+
+if _requested_by_env():
+    print('PICKLEBASE ENABLED')
+    try:
+        pb.populate()
+        pb.enabled = True
+    except Exception as e:
+        # Best effort: a failed populate leaves pb.enabled False instead
+        # of aborting the import.
+        print('FAILED TO POPULATE PICKLEBASE')
+        print(e)
diff --git a/server/src/utils/picklebase/create_table.py b/server/src/utils/picklebase/create_table.py
@@ -0,0 +1,100 @@
+import pandas as pd
+from .data_access import init_table, save_batch, save_meta
+
+
+def load_batch(engine, from_table, with_fields, batch_size, batch_number):
+    '''
+    Reads one page of `from_table` through `engine` into a DataFrame.
+
+    The page holds at most `batch_size` rows, starts at row offset
+    `batch_size * batch_number`, and is ordered by createddate ascending.
+
+    NOTE(review): createddate is the only sort key; if it is not unique,
+    LIMIT/OFFSET pages could overlap or skip rows -- confirm.
+    '''
+    return pd.read_sql(f"""
+        SELECT {(', ').join(with_fields)}
+        FROM {from_table}
+        ORDER BY createddate ASC
+        LIMIT {batch_size}
+        OFFSET {batch_size * batch_number}
+    """, engine)
+
+
+def commit_batch(table, batch_num, batch):
+    '''
+    Saves `batch` with save_batch() and returns a dict describing it:
+    file name, size on disk and in memory (as 'N MB' strings), row count,
+    and the ISO-formatted createddate of its first and last rows.
+    '''
+    def to_megs(num_bytes):
+        return '{} MB'.format(round(num_bytes / 10**6, 2))
+
+    n_rows = len(batch)
+    first_row, last_row = batch.iloc[0], batch.iloc[-1]
+    first_created = first_row['createddate'].isoformat()
+    last_created = last_row['createddate'].isoformat()
+    bytes_in_memory = batch.memory_usage(deep=True).sum()
+    filename, bytes_on_disk = save_batch(table, batch_num, batch)
+
+    print('\tSaved batch {}: {} rows'.format(batch_num, n_rows), flush=True)
+
+    return {
+        'filename': filename,
+        'sizeOnDisk': to_megs(bytes_on_disk),
+        'sizeInMemory': to_megs(bytes_in_memory),
+        'rows': n_rows,
+        'startDate': first_created,
+        'endDate': last_created}
+
+
+def _nonempty_batches(engine, from_table, with_fields, batch_size):
+    '''
+    Yields (batch_num, batch) pairs from load_batch(), counting up from 0
+    and stopping at the first batch that has no rows.
+    '''
+    batch_num = 0
+    while True:
+        batch = load_batch(
+            engine, from_table, with_fields, batch_size, batch_num)
+        if len(batch) == 0:
+            return
+        yield batch_num, batch
+        batch_num += 1
+
+
+def create_table(table,
+                 from_table,
+                 with_fields,
+                 engine,
+                 batch_size,
+                 optimize=None):
+    '''
+    Copies `from_table` into the pickle table `table`, batch by batch.
+
+    Every non-empty batch is handed to `optimize` (when given) and then
+    saved with commit_batch(). The collected metadata is written with
+    save_meta() and returned.
+    '''
+    print('\nCreating table: {}'.format(table), flush=True)
+    print('From table: {}'.format(from_table), flush=True)
+    print('With fields: {}'.format(with_fields), flush=True)
+
+    init_table(table)
+
+    batches = []
+    pages = _nonempty_batches(engine, from_table, with_fields, batch_size)
+    for batch_num, batch in pages:
+        if optimize is not None:
+            optimize(batch)
+        batches.append(commit_batch(table, batch_num, batch))
+
+    meta = {
+        'table': table,
+        'from': from_table,
+        'fields': with_fields,
+        'totalRows': sum(entry['rows'] for entry in batches),
+        'batches': batches}
+
+    save_meta(table, meta)
+    return meta
diff --git a/server/src/utils/picklebase/data_access.py b/server/src/utils/picklebase/data_access.py
@@ -0,0 +1,79 @@
+import os
+import pickle
+import shutil
+import json
+
+
+TMP_DIR = os.environ.get('TMP_DIR', os.getcwd())
+DATA_DIR = os.path.join(TMP_DIR, 'static/picklebase')
+
+
+def _recreate_dir(path):
+    '''Removes `path` (ignoring errors), then makes sure it exists.'''
+    shutil.rmtree(path, ignore_errors=True)
+    os.makedirs(path, exist_ok=True)
+
+
+def clear_data():
+    '''Wipes DATA_DIR, and with it every table directory below it.'''
+    _recreate_dir(DATA_DIR)
+
+
+def table_path(table):
+    '''Returns the directory that holds the files of `table`.'''
+    return os.path.join(DATA_DIR, table)
+
+
+def init_table(table):
+    '''Wipes and recreates the directory of `table`.'''
+    _recreate_dir(table_path(table))
+
+
+def batch_filename(batch_num):
+    '''Returns the file name used for batch number `batch_num`.'''
+    return f'batch_{batch_num}'
+
+
+def save_batch(table, batch_num, batch):
+    '''
+    Pickles `batch` into the directory of `table`.
+
+    Returns a (file name, size of the written file in bytes) tuple.
+    '''
+    filename = batch_filename(batch_num)
+    target = os.path.join(table_path(table), filename)
+    with open(target, 'wb') as out:
+        pickle.dump(batch, out)
+    return filename, os.path.getsize(target)
+
+
+def load_batch(table, batch_num):
+    '''
+    Unpickles and returns batch number `batch_num` of `table`.
+
+    pickle.load() can execute arbitrary code, so the files under
+    DATA_DIR must only ever be ones written by save_batch().
+    '''
+    filename = batch_filename(batch_num)
+    with open(os.path.join(table_path(table), filename), 'rb') as src:
+        return pickle.load(src)
+
+
+def meta_path(table):
+    '''Returns the path of the meta.json file of `table`.'''
+    return os.path.join(table_path(table), 'meta.json')
+
+
+def save_meta(table, meta):
+    '''Prints `meta` as indented JSON and writes it to meta_path(table).'''
+    serialized = json.dumps(meta, indent=2)
+    print('\nSaving meta:', flush=True)
+    print(serialized, flush=True)
+    with open(meta_path(table), 'w') as out:
+        out.write(serialized)
+
+
+def load_meta(table):
+    '''Reads meta_path(table) and returns the parsed JSON.'''
+    with open(meta_path(table), 'r') as src:
+        return json.load(src)
diff --git a/server/src/utils/picklebase/populate.py b/server/src/utils/picklebase/populate.py
@@ -0,0 +1,70 @@
+import os
+from utils.database import db
+from .data_access import clear_data
+from .create_table import create_table
+
+
+BATCH_SIZE = int(os.environ.get('PICKLEBASE_BATCH_SIZE', 400000))
+
+
+def _categorize(batch, int_columns, other_columns):
+    '''
+    Converts the named columns of `batch` to the category dtype in place.
+    Columns listed in `int_columns` are cast to 'Int64' first.
+    '''
+    for name in int_columns:
+        batch[name] = batch[name].astype('Int64').astype('category')
+    for name in other_columns:
+        batch[name] = batch[name].astype('category')
+
+
+def create_map_table():
+    '''Builds the 'map' pickle table from the 'map' source table.'''
+    fields = [
+        'createddate',
+        'srnumber',
+        'requesttype',
+        'nc',
+        'latitude',
+        'longitude'
+    ]
+
+    def optimize(batch):
+        _categorize(batch, ['nc'], ['requesttype'])
+
+    create_table(
+        table='map',
+        from_table='map',
+        with_fields=fields,
+        engine=db.engine,
+        batch_size=BATCH_SIZE,
+        optimize=optimize)
+
+
+def create_vis_table():
+    '''Builds the 'vis' pickle table from the 'vis' source table.'''
+    fields = [
+        'createddate',
+        'requesttype',
+        'requestsource',
+        'nc',
+        'cd',
+        '_daystoclose'
+    ]
+
+    def optimize(batch):
+        _categorize(batch, ['nc', 'cd'], ['requesttype', 'requestsource'])
+
+    create_table(
+        table='vis',
+        from_table='vis',
+        with_fields=fields,
+        engine=db.engine,
+        batch_size=BATCH_SIZE,
+        optimize=optimize)
+
+
+def populate():
+    '''Clears all pickle data, then rebuilds the map and vis tables.'''
+    for step in (clear_data, create_map_table, create_vis_table):
+        step()
diff --git a/server/src/utils/picklebase/query.py b/server/src/utils/picklebase/query.py
@@ -0,0 +1,72 @@
+import pandas as pd
+from .data_access import load_batch, load_meta
+
+
+def get_batch_nums(table, startDate, endDate):
+    '''
+    Returns the indexes of the batches of `table` whose recorded
+    startDate/endDate range overlaps the requested one, as listed in
+    the table's metadata.
+    '''
+    batches = load_meta(table)['batches']
+
+    return [batch_num for batch_num, batch in enumerate(batches) if (
+        startDate <= pd.to_datetime(batch['endDate']) and
+        endDate >= pd.to_datetime(batch['startDate'])
+    )]
+
+
+def _has_values(values):
+    '''Returns True when `values` is neither None nor empty.'''
+    return values is not None and len(values) > 0
+
+
+def query(table, fields, filters):
+    '''
+    Returns the `fields` columns of the rows of `table` that match
+    `filters`.
+
+    `filters` must hold 'startDate', 'endDate' (both bounds exclusive)
+    and 'requestTypes'; 'ncList' and 'cdList' are optional. A non-empty
+    ncList filters on the nc column, otherwise a non-empty cdList filters
+    on the cd column; when neither has values, no row matches.
+    '''
+    startDate = pd.to_datetime(filters['startDate'])
+    endDate = pd.to_datetime(filters['endDate'])
+    requestTypes = filters['requestTypes']
+    ncList = filters.get('ncList')
+    cdList = filters.get('cdList')
+
+    batches = []
+    for batch_num in get_batch_nums(table, startDate, endDate):
+        df = load_batch(table, batch_num)
+
+        # filters.get() yields None for an absent list, so check for None
+        # before calling len() on it.
+        if _has_values(ncList):
+            district_filter = df['nc'].isin(ncList)
+        elif _has_values(cdList):
+            district_filter = df['cd'].isin(cdList)
+        else:
+            # Same outcome as isin([]), without requiring a cd column.
+            district_filter = pd.Series(False, index=df.index)
+
+        batch = df.loc[(
+            (df['createddate'] > startDate) &
+            (df['createddate'] < endDate) &
+            df['requesttype'].isin(requestTypes) &
+            district_filter
+        ), fields]
+
+        batches.append(batch)
+
+    if not batches:
+        return pd.DataFrame(columns=fields)
+
+    combined = pd.concat(batches, ignore_index=True)
+    for c in combined.columns:
+        if hasattr(combined[c], 'cat'):
+            # Assign the result back instead of passing inplace=True,
+            # which pandas deprecated and then removed in 2.0.
+            combined[c] = combined[c].cat.remove_unused_categories()
+    return combined
diff --git a/server/src/utils/redis.py b/server/src/utils/redis.py
@@ -38,7 +38,8 @@ def set(self, key, value):
 
 
 class PickleCache(object):
-    CACHE_DIR = os.path.join(os.getcwd(), 'static/cache')
+    TMP_DIR = os.environ.get('TMP_DIR', os.getcwd())
+    CACHE_DIR = os.path.join(TMP_DIR, 'static/picklecache')
 
     def __init__(self):
         print('PICKLECACHE ENABLED')