From 989bea269e7171eac0c6e7176ba6cb393efe432b Mon Sep 17 00:00:00 2001
From: rgao
Date: Mon, 16 Dec 2019 19:21:35 -0800
Subject: [PATCH 1/2] cleaned up empty files and deprecated code in dataAnalysis folder; kept utils.py as it's referenced by other files

---
 .../ETL/__pycache__/utils.cpython-36.pyc     | Bin 2002 -> 0 bytes
 dataAnalysis/ETL/extract_clean.py            |  18 ------
 dataAnalysis/ETL/transform.py                |   0
 dataAnalysis/ETL/utils.py                    |  46 ----------------
 dataAnalysis/Karlencleaning.ipynb            |   4 +-
 dataAnalysis/{ETL/clustering.py => utils.py} |  52 ++++++++++++++++--
 6 files changed, 48 insertions(+), 72 deletions(-)
 delete mode 100644 dataAnalysis/ETL/__pycache__/utils.cpython-36.pyc
 delete mode 100755 dataAnalysis/ETL/extract_clean.py
 delete mode 100755 dataAnalysis/ETL/transform.py
 delete mode 100755 dataAnalysis/ETL/utils.py
 rename dataAnalysis/{ETL/clustering.py => utils.py} (76%)
 mode change 100644 => 100755

diff --git a/dataAnalysis/ETL/__pycache__/utils.cpython-36.pyc b/dataAnalysis/ETL/__pycache__/utils.cpython-36.pyc
deleted file mode 100644
index 843de0c7eff4103831bed75576b4816cb22bd6a7..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2002
[base85 binary delta omitted: compiled .pyc contents, not human-readable]

Date: Tue, 17 Dec 2019 18:10:02 -0800
Subject: [PATCH 2/2] combined all deprecated dataAnalysis code into utils.py, and added docstrings to functions

---
 dataAnalysis/dataCleaning.py | 52 ------------------------------
 dataAnalysis/utils.py        | 62 ++++++++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+), 52 deletions(-)
 delete mode 100644 dataAnalysis/dataCleaning.py

diff --git a/dataAnalysis/dataCleaning.py b/dataAnalysis/dataCleaning.py
deleted file mode 100644
index 2fa4fb4e7..000000000
--- a/dataAnalysis/dataCleaning.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import pandas as pd
-
-'''
-Data pipeline for ingestion of 311-data datasets
-General sections:
-1. ACQUIRE: Download data from source
-2. CLEAN: Perform data cleaning and organization before entering into SQL
-3. INGEST: Add data set to SQL database
-
-These workflows can be abstracted/encapsulated in order to better generalize
-across tasks if necessary.
-'''
-
-### 1. ACQUIRE ###
-# Code for automated data download goes here
-
-
-### 2. CLEAN ###
-
-# Load data file from TSV/CSV
-### xNOTE: Can encapsulate this workflow and reapply for each data set
-dfb = pd.read_table('311data2019.tsv',sep='\t') # For now assume data in this folder
-
-# Format dates as datetime (Time intensive)
-dfb['CreatedDate'] = pd.to_datetime(dfb['CreatedDate'])
-dfb['ClosedDate'] = pd.to_datetime(dfb['ClosedDate'])
-dfb['ServiceDate'] = pd.to_datetime(dfb['ServiceDate'])
-
-# Compute service time
-# New columns: closed_created, service_created
-dfb['closed_created'] = dfb.ClosedDate-dfb.CreatedDate
-dfb['service_created'] = dfb.ServiceDate-dfb.CreatedDate
-
-# drop NA values and reformat closed_created in units of hours
-dfb = dfb[~dfb.closed_created.isna()]
-
-# New column: closed_created in units of days
-dfb['closed_createdD'] = dfb.closed_created / pd.Timedelta(days=1)
-
-# xFUTURE: Geolocation/time clustering to weed out repeat requests
-# xFUTURE: Decide whether ServiceDate or ClosedDate are primary metric
-# xFUTURE: Removal of feedback and other categories
-
-# Save output file
-# xFUTURE: May not be necessary after SQL database established
-dfb.to_pickle('311data-cleaned.gzip')
-
-# xNote: To open: pd.read_pickle('311data-cleaned.gzip')
-
-### 3. INGEST ###
-# Code for addition to SQL database goes here
-
diff --git a/dataAnalysis/utils.py b/dataAnalysis/utils.py
index 0373a9445..9f4ed57bd 100755
--- a/dataAnalysis/utils.py
+++ b/dataAnalysis/utils.py
@@ -5,24 +5,39 @@ from shapely import wkt
 
 
 def fill_placeholder_1900(df):
+    """
+    Replace year-1900 placeholder dates with NaT
+    """
     return df.replace(to_replace=pd.to_datetime('1900'),value=pd.NaT)
 
 def to_datetime(df):
+    """
+    Convert the request date columns to pandas datetime format
+    """
     dt_cols = ['CreatedDate','UpdatedDate','ServiceDate','ClosedDate']
     for col in dt_cols:
         df[col] = pd.to_datetime(df[col])
 
 def fill_placeholder_1900_col(df):
+    """
+    Replace year-1900 placeholders in the date columns with NaT
+    """
     dt_cols = ['CreatedDate','UpdatedDate','ServiceDate','ClosedDate']
     for col in dt_cols:
         df[col] = df[col].replace(to_replace=pd.to_datetime('1900'),value=pd.NaT)
 
 def fill_placeholder_ongoing(df, cols):
+    """
+    Replace NaT entries for ongoing requests with the current timestamp
+    """
     for col in cols:
         df[col] = df[col].replace(to_replace=pd.NaT, value=datetime.now())
         # df.loc[df[col] == 'NaT', col] = datetime.now()
 
 def ddiff2days(ddiff):
+    """
+    Convert a timedelta to a float number of days
+    """
     if not pd.isnull(ddiff):
         return pd.Timedelta.total_seconds(ddiff)/(24.*3600)
     else:
@@ -37,7 +52,54 @@ def to_points(p):
 def to_geom(df):
     df['Location'] = df.Location.apply(to_points)
 
+### --- Initial efforts on data cleanup ---
+
+### 1. ACQUIRE ###
+# Code for automated data download goes here
+
+
+### 2. CLEAN ###
+
+# Load data file from TSV/CSV
+### xNOTE: Can encapsulate this workflow and reapply for each data set
+dfb = pd.read_table('311data2019.tsv',sep='\t') # For now assume data in this folder
+
+# Format dates as datetime (Time intensive)
+dfb['CreatedDate'] = pd.to_datetime(dfb['CreatedDate'])
+dfb['ClosedDate'] = pd.to_datetime(dfb['ClosedDate'])
+dfb['ServiceDate'] = pd.to_datetime(dfb['ServiceDate'])
+
+# Compute service time
+# New columns: closed_created, service_created
+dfb['closed_created'] = dfb.ClosedDate-dfb.CreatedDate
+dfb['service_created'] = dfb.ServiceDate-dfb.CreatedDate
+
+# drop NA values and reformat closed_created in units of hours
+dfb = dfb[~dfb.closed_created.isna()]
+
+# New column: closed_created in units of days
+dfb['closed_createdD'] = dfb.closed_created / pd.Timedelta(days=1)
+
+# xFUTURE: Geolocation/time clustering to weed out repeat requests
+# xFUTURE: Decide whether ServiceDate or ClosedDate are primary metric
+# xFUTURE: Removal of feedback and other categories
+
+# Save output file
+# xFUTURE: May not be necessary after SQL database established
+dfb.to_pickle('311data-cleaned.gzip')
+
+# xNote: To open: pd.read_pickle('311data-cleaned.gzip')
+
+### 3. INGEST ###
+# Code for addition to SQL database goes here
+
+# ------
+
 def add_datediff_cols(df):
+    """
+    Add date-difference columns (ClosedDiff, ServiceDiff, ClosedServiceDiff) to the DataFrame
+    Not recommended for the final product, but useful for experimentation
+    """
     df['ClosedDiff'] = df.ClosedDate - df.CreatedDate
     df['ServiceDiff'] = df.ServiceDate - df.CreatedDate
     df['ClosedServiceDiff'] = df.ClosedDate - df.ServiceDate
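
Note (not part of the patches): a minimal sketch of how the consolidated helpers in dataAnalysis/utils.py might be chained on a raw 311 extract once both patches are applied. The file name '311data2019.tsv' is carried over from the deprecated dataCleaning.py and is only a placeholder; having dataAnalysis/ on the import path and utils.py's dependencies (pandas, shapely) installed are assumptions here.

# Illustrative usage sketch, assuming a local 311 TSV extract and dataAnalysis/ on PYTHONPATH.
import pandas as pd

from utils import (to_datetime, fill_placeholder_1900_col,
                   add_datediff_cols, ddiff2days)

df = pd.read_table('311data2019.tsv', sep='\t')   # placeholder file name from the old script

to_datetime(df)                # parse CreatedDate/UpdatedDate/ServiceDate/ClosedDate in place
fill_placeholder_1900_col(df)  # turn 1900-01-01 placeholder dates into NaT
add_datediff_cols(df)          # add ClosedDiff, ServiceDiff, ClosedServiceDiff timedeltas

# express closed-to-created time as float days, as the deprecated script did
df['closed_days'] = df['ClosedDiff'].apply(ddiff2days)
print(df[['CreatedDate', 'ClosedDate', 'closed_days']].head())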