Changes to introduce more serialization
gregpawin committed Mar 18, 2022
1 parent 2d3114f commit 5b89995
Showing 4 changed files with 265 additions and 4 deletions.
4 changes: 4 additions & 0 deletions Makefile
@@ -57,6 +57,10 @@ upload_geojson:
sample:
$(PYTHON_INTERPRETER) src/data/sample.py $(frac) $(cleaned)

## Make serialized dataset
serial_data:
$(PYTHON_INTERPRETER) src/data/make_serial_data.py data/processed
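# NOTE: assumes a prior `make sample` run has already written SAMPLED_DATA_FILEPATH into .env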

## Set up python interpreter environment
create_environment:
ifeq (True,$(HAS_CONDA))
11 changes: 11 additions & 0 deletions src/data/make_dataset.py
@@ -86,6 +86,9 @@ def create_sample(
/ (target_file.stem + "_" + str(sample_frac).replace(".", "") + "samp.csv")
)

# Save sampled file path as SAMPLED_DATA_FILEPATH into .env
set_key(find_dotenv(), "SAMPLED_DATA_FILEPATH", str(SAMPLED_DATA_FILEPATH))
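# e.g. with frac=0.1 this might add a line like
# SAMPLED_DATA_FILEPATH='data/interim/<raw_stem>_01samp.csv' (path shown is illustrative);
# make_serial_data.py reads this key back from the environment at runtime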

print(f"Creating {sample_frac * 100}% sample")

# Read raw data and skiprows using random.random()
@@ -271,6 +274,14 @@ def clean(target_file: Union[Path, str], output_filedir: str, geojson=False):
# df.reset_index(drop=True,inplace=True)
# df.drop('Ticket number', axis=1)

# Filter out bad coordinates
df = df[
(df['latitude'] < 34.5) &
(df['latitude'] > 33.5) &
(df['longitude'] < -118) &
(df['longitude'] > -118.75)
].reset_index(drop=True)
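# (the box above roughly bounds the City of Los Angeles; rows projected outside it are treated as bad fixes)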

# Extract weekday and add as column
df["weekday"] = df.datetime.dt.weekday.astype(str).replace(
{
248 changes: 248 additions & 0 deletions src/data/make_serial_data.py
@@ -0,0 +1,248 @@
# -*- coding: utf-8 -*-
import click
from pathlib import Path
from dotenv import find_dotenv, load_dotenv, set_key
import urllib3
import shutil
import os
import csv
from datetime import date
import pandas as pd
import random
from pyproj import Transformer
from typing import Union
import geopandas as gpd
from shapely.geometry import Point

# Load project directory
PROJECT_DIR = Path(__file__).resolve().parents[2]


@click.command()
@click.argument("output_filedir", type=click.Path())
def main(output_filedir: str):
"""Cleans and serializes more of the dataset to speed uploading time"""
# If run as main, data is downloaded, 10% sampled, and cleaned
# If fails, it samples already downloaded data and samples at 1%, then cleans

try:
clean(
SAMPLED_DATA_FILEPATH,
output_filedir
)
except Exception as err:
print(f"Failed to clean sampled dataset: {err}")


def clean(target_file: Union[Path, str], output_filedir: str, geojson=False):
"""Removes unnecessary columns, erroneous data points and aliases,
changes geometry projection from epsg:2229 to epsg:4326, and converts
time to datetime type.
"""
# Change str filepath into Path
if isinstance(target_file, str):
target_file = Path(target_file)

print("Cleaning dataset")

# Read file into dataframe
df = pd.read_csv(target_file, low_memory=False)

# Select columns of interest
df = df[
[
"Issue Date",
"Issue time",
"RP State Plate",
"Make",
"Body Style",
"Color",
"Location",
"Violation code",
"Violation Description",
"Fine amount",
"Latitude",
"Longitude",
]
]

# Filter out data points with bad coordinates
df = df[(df.Latitude != 99999) & (df.Longitude != 99999)]
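# (99999 appears to be the dataset's sentinel value for a missing coordinate)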

# Filter out data points with no time/date stamps
df = df[
(df["Issue Date"].notna())
& (df["Issue time"].notna())
& (df["Fine amount"].notna())
]

# Convert Issue time and Issue Date strings into a combined datetime type
df["Issue time"] = df["Issue time"].apply(
lambda x: "0" * (4 - len(str(int(x)))) + str(int(x))
)
df["Datetime"] = pd.to_datetime(
df["Issue Date"] + " " + df["Issue time"], format="%m/%d/%Y %H%M"
)
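# e.g. an Issue time of 115.0 is zero-padded to "0115" and, joined with an Issue Date
# of "03/18/2022", parsed as Timestamp("2022-03-18 01:15:00") (illustrative values)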

# Drop original date/time columns
df = df.drop(["Issue Date", "Issue time"], axis=1)

# Make column names more coding friendly except for Lat/Lon
df.columns = [
"state_plate",
"make",
"body_style",
"color",
"location",
"violation_code",
"violation_description",
"fine_amount",
"Latitude",
"Longitude",
"datetime",
]

# Read in make aliases
make_df = pd.read_csv(PROJECT_DIR / "references/make.csv", delimiter=",")
make_df["alias"] = make_df.alias.apply(lambda x: x.split(","))

# Iterate over makes and replace aliases
for row in make_df.itertuples():
df = df.replace(row[2], row[1])
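# (each alias in the list row[2] is mapped to its canonical make name row[1],
# e.g. a value like "TOYT" would collapse to "Toyota"; alias spellings are illustrative)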

# Car makes to keep (Top 70 by count)
make_list = [
"Toyota",
"Honda",
"Ford",
"Nissan",
"Chevrolet",
"BMW",
"Mercedes Benz",
"Volkswagen",
"Hyundai",
"Dodge",
"Lexus",
"Kia",
"Jeep",
"Audi",
"Mazda",
"Other",
"GMC",
"Infinity",
"Chrysler",
"Subaru",
"Acura",
"Volvo",
"Land Rover",
"Mitsubishi",
"Cadillac",
"Mini",
"Porsche",
"Unknown",
"Buick",
"Freightliner",
"Tesla",
"Lincoln",
"Saturn",
"Pontiac",
"Grumman",
"Fiat",
"Jaguar",
"Mercury",
"Isuzu",
"International",
"Suzuki",
"Saab",
"Oldsmobile",
"Maserati",
"Peterbuilt",
"Kenworth",
"Smart",
"Plymouth",
"Hino",
"Harley-Davidson",
"Alfa Romero",
"Hummer",
"Bentley",
"Yamaha",
"Kawasaki",
"Geo Metro",
"Winnebago",
"Rolls-Royce",
"Scion",
"Triumph",
"Checker",
"Datsun",
"Ferrari",
"Sterling",
"Lamborghini",
"Aston Martin",
"Daewoo",
"Merkur",
"Mack",
"CIMC",
]

# Turn all other makes into "MISC."
df.loc[~df.make.isin(make_list), "make"] = "MISC."
make_list.append("MISC.")

# Enumerate list of car makes and replace with keys
make_dict = {make: ind for ind, make in enumerate(make_list)}
df["make_ind"] = df.make.replace(make_dict)

# Instantiate projection converter and change projection
transformer = Transformer.from_crs("EPSG:2229", "EPSG:4326")
df["latitude"], df["longitude"] = transformer.transform(
df["Latitude"].values, df["Longitude"].values
)
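# EPSG:2229 is NAD83 / California State Plane zone 5 (US feet); the raw "Latitude"/"Longitude"
# columns appear to hold those planar X/Y values, so this yields true WGS84 lat/lon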

# Drop original coordinate columns
df = df.drop(["Latitude", "Longitude", "make"], axis=1)
# df.reset_index(drop=True,inplace=True)
# df.drop('Ticket number', axis=1)

# Filter out bad coordinates
df = df[
(df['latitude'] < 34.5) &
(df['latitude'] > 33.5) &
(df['longitude'] < -118) &
(df['longitude'] > -118.75)
].reset_index(drop=True)

# Extract weekday and add as column
df["weekday"] = df.datetime.dt.weekday.astype(str)
# Set fine amount as int
df["fine_amount"] = df.fine_amount.astype(int)

# Drop filtered index and add new one
df.reset_index(drop=True, inplace=True)

df.to_csv(
PROJECT_DIR
/ output_filedir
/ (target_file.stem.replace("_raw", "_serialized") + ".csv"),
index=False,
quoting=csv.QUOTE_ALL,
)

return print("Saved as csv!")


if __name__ == "__main__":
# log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
# logging.basicConfig(level=logging.INFO, format=log_fmt)

# find .env automagically by walking up directories until it's found, then
# load up the .env entries as environment variables
load_dotenv(find_dotenv())
SAMPLED_DATA_FILEPATH = os.environ["SAMPLED_DATA_FILEPATH"]
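# raises KeyError if the sampling step (which writes this key via set_key) has not been run yet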

# Run main function
# logger = logging.getLogger(__name__)
# logger.info(
# 'Starting download of raw dataset: this will take a few minutes'
# )
main()
# logger.info('Finished downloading!')
6 changes: 2 additions & 4 deletions src/data/sample.py
@@ -4,16 +4,14 @@
from dotenv import find_dotenv, load_dotenv
import os
from make_dataset import clean, create_sample
import platform


@click.command()
@click.argument("frac", type=click.FLOAT)
@click.argument("cleaned", type=click.BOOL)
def main(frac: float, cleaned: bool):
"""Downloads full dataset from lacity.org, and runs data processing
scripts to turn raw data into cleaned data ready
to be analyzed. Also updates environmental
variable RAW_DATA_FILEPATH.
"""Resamples and cleanes dataset using last download raw dataset.
"""
if cleaned:
clean(
