-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcleanTransactions.R
125 lines (105 loc) · 5.58 KB
/
cleanTransactions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# load dependencies
library(lubridate)
library(readr)
# clean Transaction data
cleanTransactions <- function(baseDir, filename_raw, filename_clean, sample_n=NULL, sample_seed=NULL) {
#### DATA IMPORT ####
# define schema and column types
col_types <- readr::cols(DataId = col_integer(),
MeterCode = col_integer(),
TransactionId = col_integer(),
TransactionDateTime = col_datetime("%m/%d/%Y %H:%M:%S"),
Amount = col_double(),
UserNumber = col_integer(),
PaymentMean = col_character(),
PaidDuration = col_integer(),
ElementKey = col_integer(),
TransactionYear = col_integer(),
TransactionMonth = col_integer(),
Vendor = col_character())
# import transaction data
filepath_raw <- paste0(baseDir, filename_raw)
transactions <- readr::read_csv(filepath_raw, col_types = col_types, na="NULL")
#### DATA CLEANING ####
# replace missing PaymentMean values and convert to uppercase to eliminate inconsistencies
transactions$PaymentMean[transactions$PaymentMean=="$record.get(\"PAYMENT_TYPE\")"] <- NA
transactions$PaymentMean[transactions$PaymentMean=="Credit Card"] <- "CREDIT CARD"
transactions$PaymentMean[transactions$PaymentMean=="Smart Card"] <- "SMART CARD"
transactions$PaymentMean[transactions$PaymentMean=="Phone"] <- "PHONE"
transactions$PaymentMean[transactions$PaymentMean=="CASH"] <- "COINS"
# pull out transaction date and time
transactions$TransactionDate <- date(transactions$TransactionDateTime)
# TODO: consider adding seconds to this time, since this might be used when pulled back in to R anyway...
transactions$timeStart <- paste(sprintf("%02d", hour(transactions$TransactionDateTime)),
sprintf("%02d", minute(transactions$TransactionDateTime)),
sep=":")
# add expired time column
timeExpired <- transactions$TransactionDateTime + transactions$PaidDuration
# TODO: consider adding seconds to this time, since this might be used when pulled back in to R anyway...
transactions$timeExpired <- paste(sprintf("%02d", hour(timeExpired)),
sprintf("%02d", minute(timeExpired)),
sep=":")
# NOTE: the three columns above might not be fully representative of parking times
# NOTE: Per Fremont pay station, users can pre-pay for the next morning starting at 10pm
# remove rows with erroneous Amount values (i.e. charges > $25)
transactions <- transactions[(is.na(transactions$Amount) | transactions$Amount<=25),]
# convert PaidDuration to minutes
transactions$Duration_mins <- round(transactions$PaidDuration/60, 0)
# remove UserNumber values != (1 | NA)
# Documentation says this value should be 1; vast majority of records match this condition
transactions <- transactions[(transactions$UserNumber==1 | is.na(transactions$UserNumber)),]
# remove records w/ Duration_mins values <=0 and >600 (10 hrs max allowed)
transactions <- transactions[(is.na(transactions$Duration_mins) |
transactions$Duration_mins>0 |
transactions$Duration_mins<=600),]
# drop columns that have no (useful) data
cols_to_drop <- c("DataId",
"UserNumber",
"TransactionYear",
"TransactionMonth",
"PaidDuration",
"Vendor")
cols_to_drop <- colnames(transactions) %in% cols_to_drop
# unique(transactions[,cols_to_drop])
transactions <- transactions[,!cols_to_drop]
# reorder remaining columns
newColOrder <- c("TransactionId","TransactionDateTime", "TransactionDate",
"timeStart", "timeExpired", "Duration_mins",
"Amount", "PaymentMean",
"MeterCode", "ElementKey")
transactions <- transactions[,newColOrder]
#### DATA EXPORT ####
# pull out transactions data sample if sample_n is not null
if (!is.null(sample_n)) {
if (sample_n > nrow(transactions)) {
message(paste0("sample_n > nrow(transactions) after cleaning; returned all available results (",
nrow(transactions), ")"))
sample_n <- nrow(transactions)
}
if (!is.null(sample_seed)) {
set.seed(sample_seed)
}
transactions <- transactions[sample(nrow(transactions),sample_n),]
}
# write cleaned data to new .csv file
filepath_clean <- paste0(baseDir, filename_clean)
readr::write_csv(transactions, filepath_clean, na = "")
}
# #### SAMPLE CODE ####
#
# baseDir <- "~/Desktop/3 - DATA 512/Assignments/A6 - Final Project/data/raw/"
#
# # full dataset; WARNING: takes a LONG time!
# filename_raw <- "ParkingTransaction_20120101_20170930_raw.csv"
# filename_clean <- "../ParkingTransaction_20120101_20170930_cleaned.csv"
# cleanTransactions(baseDir, filename_raw, filename_clean)
#
# # 1000000-row sample
# filename_raw <- "ParkingTransaction_20120101_20170930_raw.csv"
# filename_clean_sample <- "../ParkingTransaction_cleanedSAMPLE.csv"
# cleanTransactions(baseDir, filename_raw, filename_clean_sample, 1000000, 228)
#
# # # test code on a single file for troubleshooting, etc.
# # filename_raw_test <- "transactions_by_week/ParkingTransaction_20120429_20120505.csv"
# # filename_clean_test <- "../ParkingTransaction_cleanedTESTTESTTEST.csv"
# # cleanTransactions(baseDir, filename_raw_test, filename_clean_test)