-
Notifications
You must be signed in to change notification settings - Fork 1
/
UCS investigation_Tyson water pollution_R code_SEW.R
399 lines (281 loc) · 11.8 KB
/
UCS investigation_Tyson water pollution_R code_SEW.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
# # For UCS investigation into water pollution from Tysons meat processing plants
# Work done in 2023, report expected release April 2024
# Stacy Woods
#####################################################################
# packages
library(tidyverse)
#################################################################################
########################NPDES ECHO DMR data #########
#################################################################################
setwd("...")
getwd()
DMR1822 <- read.csv(file="DMR1822all.csv", header=TRUE, sep=",")
### explore data ###
colnames(DMR1822)
nrow(DMR1822)
str(DMR1822)
sum(DMR1822$Total.Pounds..lb.yr.)
# Note that EPA requires states to enter facilities w major status into NPDES
# but does NOT require entry for non-major facilities
sum(DMR1822$Total.TWPE..lb.eq.yr.)
#note Total TWPE = Toxic Weighted Pounds Equivalent (TWPE) is the
# mass of a pollutant or chemical discharged that accounts for its
# relative toxicity. To convert pollutant loadings into TWPE, EPA
# multiplies the pollutant mass (in pounds) by its toxic weighting
# factor (TWF).
#note SIC code describes the primary activity of the facility
# Count number of unique states
length(unique(DMR1822$State))
# States?
unique(DMR1822$State)
#FRS.ID?
unique(DMR1822$FRS.ID)
# Is not a unique ID for facility
############ clean data: pass 1 ############
# Create column of fac name + city + county + state
DMR1822$fac.ci.co.st <- paste(DMR1822$Facility.Name, ",", DMR1822$City, ",", DMR1822$County, ",", DMR1822$State)
colnames(DMR1822)
unique(DMR1822$fac.ci.co.st)
# 49 unique facilities
### Remove facility TYSON MHP , WEST MONROE , OUACHITA , LA because this is a mobile home park, not a
# meat packing facility, and therefore not of interest to this investigation
colnames(DMR1822)
unique(DMR1822$fac.ci.co.st)
DMR1822v1 <- subset(DMR1822, fac.ci.co.st!= "TYSON MHP , WEST MONROE , OUACHITA , LA")
# States?
unique(DMR1822v1$State)
# How many unique facilities?
unique(DMR1822v1$fac.ci.co.st)
############ Investigation: Nitrate ############
unique(DMR1822v1$Pollutant.Name)
# isolate "Nitrate nitrogen"
nitrate <- subset(DMR1822v1, Pollutant.Name== "Nitrogen, nitrate dissolved")
colnames(nitrate)
unique(nitrate$Pollutant.Name)
# How many pounds nitrate Tyson released over study period?
sum(nitrate$Total.Pounds..lb.yr.)
# Where did Tysons release nitrate into waters during study period?
unique(nitrate$State)
# How much nitrate in NE?
NEnitrate <- subset(nitrate, State == "NE")
colnames(NEnitrate)
unique(NEnitrate$State)
sum(NEnitrate$Total.Pounds..lb.yr.)
# How much nitrate in TX?
TXnitrate <- subset(nitrate, State == "TX")
colnames(TXnitrate)
unique(TXnitrate$State)
sum(TXnitrate$Total.Pounds..lb.yr.)
############ clean data: pass 2 ############
### Remove duplicate pollutants ###
# Per EPA, some chemicals are represented in more than one pollutant category
# in DMR:
# (1) the "nitrogen" pollutant category also includes data for the following categories:
# Ammonia as N, Inorganic nitrogen, Nitrite nitrogen, Nitrate nitrogen, Organic nitrogen,
# Total Kjeldahl nitrogen
# (2) the “phosphorous" category also includes data for: total phosphate
# (3) the "chromium" category also includes data for: hexavalent chromium
# create new dataset, removing the "also includes" listed above
colnames(DMR1822v1)
unique(DMR1822v1$Pollutant.Name)
DMR1822v2 <- subset(DMR1822v1, Pollutant.Name!= "Ammonia as N" & Pollutant.Name!= "Inorganic Nitrogen"
& Pollutant.Name!="Nitrite nitrogen, dissolved (as N)" & Pollutant.Name!= "Nitrogen, nitrate dissolved"
& Pollutant.Name!= "Organic Nitrogen" & Pollutant.Name!= "Total Kjeldahl Nitrogen" &
Pollutant.Name!= "Phosphate, total (as PO4)" & Pollutant.Name!= "Chromium, Hexavalent")
colnames(DMR1822v2)
nrow(DMR1822v2)
# States?
unique(DMR1822v2$State)
# How many unique facilities?
unique(DMR1822v2$fac.ci.co.st)
# How many unique pollutants?
unique(DMR1822v2$Pollutant.Name)
getwd()
write.csv(DMR1822v2,'DMR1822cleanpollutant.csv')
#################################
### reorg and summarize data ###
#################################
######## Long to wide format - total by pollutant
colnames(DMR1822v2)
nrow(DMR1822v2)
# Create df with only columns of interest
mypolldf <- DMR1822v2[c("Year", "Pollutant.Name", "Total.Pounds..lb.yr.", "Total.TWPE..lb.eq.yr.")]
colnames(mypolldf)
# Pivot long to wide, summing total lbs & tot TWPE
mypolldfwd <- mypolldf %>%
pivot_wider(
names_from = Pollutant.Name,
values_from = c(Total.Pounds..lb.yr., Total.TWPE..lb.eq.yr.),
values_fn = sum
)
nrow(mypolldfwd)
colnames(mypolldfwd)
str(mypolldfwd)
getwd()
write.csv(mypolldfwd,'pollutant totals by year.csv')
######## Long to wide format - facility total
# Create df with only columns of interest
colnames(DMR1822v2)
mydf <- DMR1822v2[c("Year", "fac.ci.co.st", "Facility.Name", "City", "County" , "State",
"HUC.12.Code" , "Watershed.Name", "Facility.Latitude", "Facility.Longitude",
"Pollutant.Name", "Contains.Potential.Outliers." ,
"Total.Pounds..lb.yr.", "Total.TWPE..lb.eq.yr.")]
colnames(mydf)
nrow(DMR1822v2)
nrow(mydf)
## Create a wide dataframe where if multiple values then sum ##
# Use values_fn
# First convert col "Contains Potential Outliers?" to Outliers 1/0
unique(mydf$Contains.Potential.Outliers.)
# frequency table for different unique values of Potential outliers
table(mydf$Contains.Potential.Outliers.)
mydf$PotOutlier <- case_when(mydf$Contains.Potential.Outliers.=="Y" ~ 1,
mydf$Contains.Potential.Outliers.=="" ~ 0)
unique(mydf$PotOutlier)
table(mydf$PotOutlier)
# 12 outliers, 1504 non-outliers
# Create new df that drops the text "Contains.Potential.Outliers." column but retains
# the numeric "PotOutlier" column
mydf2 <- mydf[c("Year", "fac.ci.co.st", "Facility.Name", "City", "County" , "State",
"HUC.12.Code" , "Watershed.Name", "Facility.Latitude", "Facility.Longitude",
"Pollutant.Name", "PotOutlier" ,
"Total.Pounds..lb.yr.", "Total.TWPE..lb.eq.yr.")]
# Pivot long to wide, summing PotOutliers as well as total lbs
colnames(mydf2)
mydfwd <- mydf2 %>%
pivot_wider(
names_from = Pollutant.Name,
values_from = c(Total.Pounds..lb.yr., Total.TWPE..lb.eq.yr., PotOutlier),
values_fn = sum
)
nrow(mydfwd)
colnames(mydfwd)
str(mydfwd)
getwd()
write.csv(mydfwd,'DMR1822wideV1.csv')
colnames(mydfwd)
# create summary columns for Total pounds per year
# using tidyverse mutate
mydfwd2 <- mydfwd %>%
mutate(SumTotLbsYr = select(., "Total.Pounds..lb.yr._Solids, total dissolved":"Total.Pounds..lb.yr._Naphthalene") %>% rowSums(na.rm = TRUE))
colnames(mydfwd2)
str(mydfwd2$SumTotLbsYr)
# create summary column for Total TWPE pounds per year
colnames(mydfwd2)
mydfwd3 <- mydfwd2 %>%
mutate(SumTotTWPELbsYr = select(., "Total.TWPE..lb.eq.yr._Solids, total dissolved":"Total.TWPE..lb.eq.yr._Naphthalene") %>% rowSums(na.rm = TRUE))
colnames(mydfwd3)
str(mydfwd3$SumTotTWPELbsYr)
# create summary column for potential outliers per facility per year
colnames(mydfwd3)
mydfwd4 <- mydfwd3 %>%
mutate(SumPotOutlierYr = select(., "PotOutlier_Solids, total dissolved":"PotOutlier_Naphthalene") %>% rowSums(na.rm = TRUE))
colnames(mydfwd4)
str(mydfwd4$SumPotOutlierYr)
# Simplify data to columns of interest (summary columns)
colnames(mydfwd4)
DMR1822wd<- mydfwd4[c("Year", "fac.ci.co.st", "Facility.Name", "City", "County" , "State",
"HUC.12.Code" , "Facility.Latitude",
"Facility.Longitude", "SumTotLbsYr" , "SumTotTWPELbsYr", "SumPotOutlierYr" )]
colnames(DMR1822wd)
# Pivot wider:
# summing PotOutliers as well as total lbs so that each facility has one only row, and years are represented by columns
# with yr attached to (1) SumTotLbsYr, (2) SumTotTWPELbsYr, (3) SumPotOutlierYr
DMR1822wdr <- DMR1822wd %>%
pivot_wider(
names_from = Year,
values_from = c(SumTotLbsYr, SumTotTWPELbsYr, SumPotOutlierYr),
values_fn = sum
)
nrow(DMR1822wdr)
colnames(DMR1822wdr)
str(DMR1822wdr)
getwd()
write.csv(DMR1822wdr,'DMR1822widerV1.csv')
# create summary column for Total pounds
# using tidyverse mutate
colnames(DMR1822wdr)
mydfwdr <- DMR1822wdr %>%
mutate(SumTotLbs1822 = select(., "SumTotLbsYr_2018":"SumTotLbsYr_2022") %>% rowSums(na.rm = TRUE))
colnames(mydfwdr)
str(mydfwdr$SumTotLbs1822)
# create summary column for Total TWPE pounds
mydfwdr2 <- mydfwdr %>%
mutate(SumTotTWPELbs1822 = select(., "SumTotTWPELbsYr_2018":"SumTotTWPELbsYr_2022") %>% rowSums(na.rm = TRUE))
colnames(mydfwdr2)
str(mydfwdr2$SumTotTWPELbs1822)
# create a summary column for total outliers
mydfwdr3 <- mydfwdr2 %>%
mutate(SumPotOutliers1822 = select(., "SumPotOutlierYr_2018":"SumPotOutlierYr_2022") %>% rowSums(na.rm = TRUE))
colnames(mydfwdr3)
str(mydfwdr3$SumPotOutliers1822)
#########################################################
##### ANALYSIS, all years data combined #####
#########################################################
# Note: data of interest = total pounds per year
### Top 5 polluters, all pollutants combined, 2018-2022 cumulative, by amount
## To return top & bottom 10:
head(mydfwdr3[order(-mydfwdr3$SumTotLbs1822),],10)
tail(mydfwdr3[order(-mydfwdr3$SumTotLbs1822),],10)
# export as csv
getwd()
write.csv(mydfwdr3,'DMR1822summaries by plant.csv')
###############################################################
###############################################################
###
### DMR SUMMARY ###
###
###############################################################
###############################################################
DMR1822sumbyplant <- read.csv(file="DMR1822summaries by plant.csv", header=TRUE, sep=",")
colnames(DMR1822sumbyplant)
nrow(DMR1822sumbyplant)
str(DMR1822sumbyplant)
# Count of unique facilities
length(unique(DMR1822sumbyplant$fac.ci.co.st))
# Count number of unique states
length(unique(DMR1822sumbyplant$State))
# which states
unique(DMR1822sumbyplant$State)
# which pollutants
# use older df, removing duplicates
unique(DMR1822v2$Pollutant.Name)
# how many outliers
sum(DMR1822sumbyplant$SumPotOutliers1822)
#total pounds discharge, all plants
sum(DMR1822sumbyplant$SumTotLbs1822)
# total pounds hazard weighted discharge, all plants
sum(DMR1822sumbyplant$SumTotTWPELbs1822)
##############
### Worst offender facilities ###
#############
# Focus on Total pounds, not hazard weighted pounds, for simplicity in messaging
colnames(DMR1822sumbyplant)
nrow(DMR1822sumbyplant)
# Create simple dataframe w data of interest
facilitydf <- DMR1822sumbyplant[c("Facility.Name", "City" , "County" , "State" ,
"Facility.Latitude" , "Facility.Longitude" , "SumTotLbs1822")]
nrow(facilitydf)
colnames(facilitydf)
str(facilitydf)
getwd()
write.csv(facilitydf,'Facility summary_DMR total lbs_1822.csv')
##############
### States w worst offenders ###
#############
colnames(DMR1822sumbyplant)
nrow(DMR1822sumbyplant)
# Create simple dataframe w data of interest
tempdf <- DMR1822sumbyplant[c("State" , "SumTotLbs1822" )]
nrow(tempdf)
# Combine rows w same column value for states, sum other column values
colnames(tempdf)
statesdf <- tempdf %>%
group_by(State) %>%
summarise(across(c(SumTotLbs1822), sum))
nrow(statesdf)
colnames(statesdf)
str(statesdf)
getwd()
write.csv(statesdf,'State summary_DMR total lbs_1822.csv')
### END ###