-
Notifications
You must be signed in to change notification settings - Fork 0
/
analysis.py
76 lines (57 loc) · 2.85 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import pandas as pd
# Read the data
permits_df = pd.read_csv("data/processed/data_parking_permits_and_spaces.csv")
# drop rows where Oblast contains a dot
permits_df = permits_df[~permits_df["Oblast"].str.contains("\.")]
# drop rows where XSUM is empty
permits_df = permits_df[permits_df["XSUM"].notnull()]
# drop rows where Oblast isn't between P01 and P10
permits_df = permits_df[
permits_df["Oblast"].str.contains("P0[1-3]")
# | permits_df["Oblast"].str.contains("P10")
]
# find the oldest and most recent date common to all oblasts
# so for each Oblast, find the oldest and most recent date, then find the newest of the oldest dates and the oldest of the newest dates
oblast_dates = {}
for oblast in permits_df["Oblast"].unique():
oblast_df = permits_df[permits_df["Oblast"] == oblast]
oblast_dates[oblast] = (oblast_df["date"].min(), oblast_df["date"].max())
# find the newest of the oldest dates and the oldest of the newest dates
oldest_date = max([x[0] for x in oblast_dates.values()])
newest_date = min([x[1] for x in oblast_dates.values()])
print(f"Oldest date: {oldest_date}")
print(f"Newest date: {newest_date}")
# drop rows where date is outside of the range
permits_df = permits_df[
(permits_df["date"] >= oldest_date) & (permits_df["date"] <= newest_date)
]
# get a list of all columns starting with POP_
# pop_columns = [x for x in permits_df.columns if x.startswith("POP_")]
pop_columns = ["POP_R", "POP_A", "POP_P"]
# now for each Oblast, compare the number of POP_R in the oldest and newest row (date) and print the difference in percentage
for oblast in permits_df["Oblast"].unique():
print(f"Oblast: {oblast}")
oblast_df = permits_df[permits_df["Oblast"] == oblast]
for pop_column in pop_columns:
oldest_row = oblast_df[oblast_df["date"] == oldest_date]
newest_row = oblast_df[oblast_df["date"] == newest_date]
oldest_pop = oldest_row[pop_column].values[0]
oldest_xsum = oldest_row["XSUM"].values[0]
newest_pop = newest_row[pop_column].values[0]
newest_xsum = newest_row["XSUM"].values[0]
diff = (newest_pop / newest_xsum) - (oldest_pop / oldest_xsum)
print(
f"{pop_column}: {diff:.2%} ({oldest_pop / oldest_xsum:.2%} -> {newest_pop / newest_xsum:.2%}) ({oldest_pop} -> {newest_pop})"
)
# forecast the number of permits for 3 years from now
# now for XSUM as a ratio of RES_PS + MIX_PS
ratio_newest = newest_row["XSUM"].values[0] / (
newest_row["RES_PS"].values[0] + newest_row["MIX_PS"].values[0]
)
ratio_oldest = oldest_row["XSUM"].values[0] / (
oldest_row["RES_PS"].values[0] + oldest_row["MIX_PS"].values[0]
)
diff = ratio_newest - ratio_oldest
print(
f"POP / PS (RES+MIX): {diff:.2%} ({ratio_oldest:.2%} -> {ratio_newest:.2%}) ({oldest_row['XSUM'].values[0]} -> {newest_row['XSUM'].values[0]})"
)