hackforla · sellnat77 · Dec 18, 2019 · Dec 17, 2019 · Dec 18, 2019
diff --git a/dataAnalysis/ETL/__pycache__/utils.cpython-36.pyc b/dataAnalysis/ETL/__pycache__/utils.cpython-36.pyc
diff --git a/dataAnalysis/ETL/extract_clean.py b/dataAnalysis/ETL/extract_clean.py
diff --git a/dataAnalysis/ETL/transform.py b/dataAnalysis/ETL/transform.py
diff --git a/dataAnalysis/ETL/utils.py b/dataAnalysis/ETL/utils.py
diff --git a/dataAnalysis/Karlencleaning.ipynb b/dataAnalysis/Karlencleaning.ipynb
@@ -13,7 +13,7 @@
     "from shapely.geometry import LineString, Polygon, Point\n",
     "from shapely import wkt\n",
     "\n",
-    "from ETL import utils\n",
+    "import utils\n",
     "\n",
     "import matplotlib.pyplot as plt\n",
     "%matplotlib inline"
@@ -320,4 +320,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
diff --git a/dataAnalysis/dataCleaning.py b/dataAnalysis/dataCleaning.py
diff --git a/dataAnalysis/ETL/clustering.py → dataAnalysis/utils.py b/dataAnalysis/ETL/clustering.py → dataAnalysis/utils.py
@@ -1,6 +1,111 @@
-import numpy as np 
-import random as rd
+import numpy as np
 import pandas as pd
+import random as rd
+from datetime import datetime
+from shapely import wkt
+
+def fill_placeholder_1900(df):
+    """
+    Return a copy of df with every 1900-01-01 placeholder timestamp replaced by NaT
+    """
+    return df.replace(to_replace=pd.to_datetime('1900'),value=pd.NaT)
+
+def to_datetime(df):
+    """
+    Convert the four date columns listed in dt_cols to pandas datetime, modifying df in place
+    """
+    dt_cols = ['CreatedDate','UpdatedDate','ServiceDate','ClosedDate']
+    for col in dt_cols:
+        df[col] = pd.to_datetime(df[col])
+
+def fill_placeholder_1900_col(df):
+    """
+    Replace 1900-01-01 placeholder timestamps with NaT in the four date columns, modifying df in place
+    """
+    dt_cols = ['CreatedDate','UpdatedDate','ServiceDate','ClosedDate']
+    for col in dt_cols:
+        df[col] = df[col].replace(to_replace=pd.to_datetime('1900'),value=pd.NaT)
+
+def fill_placeholder_ongoing(df, cols):
+    """
+    Replace NaT entries in the given columns with the current local time, modifying df in place
+    """
+    for col in cols:
+        df[col] = df[col].replace(to_replace=pd.NaT, value=datetime.now())
+        # df.loc[df[col] == 'NaT', col] = datetime.now()
+
+def ddiff2days(ddiff):
+    """
+    Convert a pandas Timedelta to a float number of days; a null input yields NaN
+    """
+    if not pd.isnull(ddiff):
+        return pd.Timedelta.total_seconds(ddiff)/(24.*3600)
+    else:
+        return np.NaN  # NOTE(review): the np.NaN alias was removed in NumPy 2.0; np.nan is the portable spelling
+
+def to_points(p):  # Parse a coordinate string into a geometry via shapely WKT; float inputs pass through
+    if type(p) == float:  # presumably NaN for a missing location - TODO confirm; NOTE(review): isinstance() is the usual check
+        return p
+    else:
+        return wkt.loads('Point{}'.format(p.replace(',',' ')))  # commas become spaces, 'Point' is prefixed, result parsed as WKT
+
+def to_geom(df):  # Overwrite df['Location'] in place with to_points applied to every entry
+    df['Location'] = df.Location.apply(to_points)
+
+### --- Initial efforts on data cleanup (NOTE(review): top-level statements, so they run whenever this module is imported) ---
+
+### 1. ACQUIRE ###
+# Code for automated data download goes here
+
+
+### 2. CLEAN ###
+
+# Load the tab-separated data file
+### xNOTE: Can encapsulate this workflow and reapply for each data set
+dfb = pd.read_table('311data2019.tsv',sep='\t') # Relative path: the file must be in the current working directory
+
+# Format dates as datetime (Time intensive)
+dfb['CreatedDate'] = pd.to_datetime(dfb['CreatedDate'])
+dfb['ClosedDate'] = pd.to_datetime(dfb['ClosedDate'])
+dfb['ServiceDate'] = pd.to_datetime(dfb['ServiceDate'])
+
+# Compute service time as timedeltas measured from CreatedDate
+# New columns: closed_created, service_created
+dfb['closed_created'] = dfb.ClosedDate-dfb.CreatedDate
+dfb['service_created'] = dfb.ServiceDate-dfb.CreatedDate
+
+# Keep only rows where closed_created is not missing
+dfb = dfb[~dfb.closed_created.isna()]
+
+# New column: closed_createdD, i.e. closed_created as a float number of days
+dfb['closed_createdD'] = dfb.closed_created / pd.Timedelta(days=1)
+
+# xFUTURE: Geolocation/time clustering to weed out repeat requests
+# xFUTURE: Decide whether ServiceDate or ClosedDate are primary metric
+# xFUTURE: Removal of feedback and other categories
+
+# Save output file (NOTE(review): pandas infers compression from the extension; '.gzip' may not be recognised - confirm)
+# xFUTURE: May not be necessary after SQL database established
+dfb.to_pickle('311data-cleaned.gzip')
+
+# xNote: To open: pd.read_pickle('311data-cleaned.gzip')
+
+### 3. INGEST ###
+# Code for addition to SQL database goes here
+
+# ------
+
+def add_datediff_cols(df):
+    """
+    Add date-difference columns to df in place: three timedeltas plus their float-day equivalents
+    Not recommended for final product, but useful for experimentation
+    """
+    df['ClosedDiff'] = df.ClosedDate - df.CreatedDate
+    df['ServiceDiff'] = df.ServiceDate - df.CreatedDate
+    df['ClosedServiceDiff'] = df.ClosedDate - df.ServiceDate
+    df['ClosedDiff_Days'] = df.ClosedDiff.apply(ddiff2days)
+    df['ServiceDiff_Days'] = df.ServiceDiff.apply(ddiff2days)
+    df['ClosedServiceDiff_Days'] = df.ClosedServiceDiff.apply(ddiff2days)
 
 def combine_coor(dataset):
     """
@@ -179,7 +284,4 @@ def run_kmeans(dataset, k, manual=False, reps=[], t=1000):
 
 #         z_min.append(data[np.argmin(sum_dist)])
 
-#     print(z_min)
-
-
-
+#     print(z_min)