-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpark_info_data_cleaner.R
123 lines (100 loc) · 4.79 KB
/
park_info_data_cleaner.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# data originally from:
# https://data.sfgov.org/Culture-and-Recreation/Recreation-Park-Department-Park-Info-Dataset
parks <- read_csv('data/Recreation___Park_Department_Park_Info_Dataset.csv')
# Things I want to fix:
# Remove double header row
# Get missing location data
# Split Lat and Long into own columns
# Get street numbers for the missing (reverse geo code)
# remove extra header row
parks <- parks[2:nrow(parks),]
dim(subset(parks, is.na(`Location 1`)))
address_fixer <- function(park,location){
parks[parks$ParkName == park,]$`Location 1` <<- location
}
# You can actually get all these from the JSON verson, they are stored
# differently from the rest for some reason.
pairs <- list(c('Arkansas Friendship Garden',
'22nd/Arkansas, San Francisco, CA, (37.75737790, -122.39766207)'),
c('BAY VIEW PARK',
"LeConte Ave, San Francisco, CA, (37.71513230, -122.39227019)"),
c('BERNAL HEIGHTS PARK',
"Bernal Heights Blvd, San Francisco, CA, (37.74313167, -122.41425204)"),
c('Connecticut Friendship Garden',
"22nd/Connecticut, San Francisco, CA, (37.75739852, -122.39731712)"),
c('CRAGS COURT GARDEN',
'Crags Ct, San Francisco, CA, (37.74141960, -122.44067225)'),
c('Dog Patch-Miller Memorial Comm. Garden',
'Brewster/Rutledge St, San Francisco, CA, (37.74503985, -122.40715464)'),
c('ESPRIT PARK',
'Minnesota St, San Francisco, CA, (37.76098611, -122.39099369)'),
c('Garden for the Environment',
'7th Ave North of Lawton, San Francisco, CA, (37.75911093, -122.46357301'),
c('HOLLY PARK',
'Holly Park Cir, San Francisco, CA, (37.73720955, -122.41994010'),
c('Hooker Alley Community Garden',
'Hooker Aly, San Francisco, CA, (37.79043353, -122.41017264)'),
c('LAKE MERCED PARK',
'Lake Merced Blvd, San Francisco, CA, (37.72183556, -122.49238175)'),
c('MT DAVIDSON PARK',
'Myra Wy, San Francisco, CA, (37.73906280, -122.45465821)'),
c('MT OLYMPUS',
'Upper Terrace, San Francisco, CA, (37.76330762, -122.44554051)' ),
c("O'SHAUGHNESSY HOLLOW",
"O'Shaughnessy Blvd, San Francisco, CA, (37.74016899, -122.44487598)"),
c('PARK PRESIDIO BLVD',
'Park Presidio Blvd, San Francisco, CA, (37.78012283, -122.47228457)'),
c('RIDGETOP PLAZA',
'Whitney Young Cire, San Francisco, CA, (37.73379832, -122.38345175)'),
c('TELEGRAPH HILL/PIONEER PARK',
'Telegraph Hill Blvd, San Francisco, CA, (37.80278413, -122.40583596)'),
c('TOPAZ OPEN SPACE',
'Diamond Heights, San Francisco, CA, (37.74170413, -122.43616246)'),
c('TWIN PEAKS',
'Twin Peaks Blvd, San Francisco, CA, (37.75279086, -122.44748390)'),
c('WILLIE WOO WOO WONG PLAYGROUND',
'Sacramento St, San Francisco, CA, (37.79358605, -122.40718489)'),
c('YACHT HARBOR AND MARINA GREEN',
'Marina Blvd, San Francisco, CA, (37.80677550, -122.44076019)')
)
# Convert to dataframe
locations <- do.call(rbind, sapply(pairs, data.frame, stringsAsFactors=FALSE))
# OH NOOOOOOO A LOOP!!! Apply was fighting me and this runs fast.
for(p_row in 1:nrow(locations)){
address_fixer(locations[p_row,1],locations[p_row,2])
}
# Only ones left are the GGP sections
dim(subset(parks, is.na(`Location 1`)))
# General cleaning
parks$`Location 1` <- gsub('\n',' ',parks$`Location 1`)
parks$`Location 1` <- gsub('&', '&', parks$`Location 1`)
# Split Lat and Long to own columns
parks$Lat <- gsub( '.*[(]','', parks$`Location 1`) %>%
gsub('[,].*','',.)
parks$Long <- gsub( '.*[, ]','',parks$`Location 1`) %>%
gsub('[)].*','',.)
write_csv(parks, 'data/cleaned_parks_info.csv')
quantile(parks$Acreage, c(.2,.4,.6,.8,1))
test <- function(row){
if(row['Acreage'] <.210){
'size_tiny'
}else if(row['Acreage'] > .210 & row['Acreage'] < .800){
'size_small'
}else if(row['Acreage'] > .800 & row['Acreage'] < 2.186){
'size_average'
}else if(row['Acreage'] > 2.186 & row['Acreage'] < 6.266){
'size_large'
}else{
'size_huge'
}
}
park_size <- c()
for(x in 1: nrow(parks)){
park_size <- c(park_size,test(parks[x,]))
}
parks$size <- park_size
# read in data for mapping to neighborhoods
zip2hood <- read_csv('data/zipcode_to_neighborhood.csv')
# This will drop the sections of Golden Gate, fine for our use
parks_neighborhoods <- merge(parks, zip2hood, by.x='Zipcode', by.y='zipcode')
write_csv(parks_neighborhoods, 'data/parks_info_with_size_neighborhood.csv')