-
Notifications
You must be signed in to change notification settings - Fork 554
/
Intro to R - Make Yellow_Sample.R
36 lines (30 loc) · 1.23 KB
/
Intro to R - Make Yellow_Sample.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# Bringing in column headers as names and using them to set names
if (!require("data.table")) install.packages("data.table")
library("data.table")
header <- read.table("yellow_tripdata_2017-06.csv", header = TRUE,
sep=",", nrow = 1)
DF <- fread("yellow_tripdata_2017-06.csv", skip=1, sep=",",
header=FALSE, data.table=FALSE)
setnames(DF, colnames(header))
# Save the "bad" observations so we can clean them out
# in the next chapter
DF2<-DF[which(DF$total_amount<=0 |
DF$fare_amount >=100000 |
DF$fare_amount < 0 |
DF$trip_distance >= 100),]
# Reform DF with only the "good" observations
DF<-DF[which(DF$total_amount >0 &
DF$fare_amount <100000 &
DF$fare_amount >= 0 &
DF$trip_distance < 100),]
# Select a random subsample of 1,000,000 rows
set.seed(10)
index <- sample(1:nrow(DF), 1000000, replace=FALSE)
# Look at the index to see it's just row numbers
head(index)
# Copy the row numbers for the sample only into Yellow_Sample
Yellow_Sample <- DF[index,]
# Concatenate (or "bind") the random sample and the "bad" ones
Yellow_Sample <- rbind(DF2,Yellow_Sample)
# Save the data frame as csv
fwrite(Yellow_Sample,"Yellow_Sample.csv")