forked from volkovacodes/Block_Codes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path6. parsing_block_identity.R
150 lines (140 loc) · 5.45 KB
/
6. parsing_block_identity.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
dir_in <- "/Volumes/KINGSTON/Blocks/Clean Forms/"
dir_out <- "/Volumes/KINGSTON/Blocks/Parsed Forms/"
start_year <- 1994
start_QTR <- 1
end_year <- 2018
end_QTR <- 4
require(RSQLite)
require(data.table)
### generate sequence of quaters
get_dates <- function(start_year, start_QTR, end_year, end_QTR)
{
require(data.table)
all_dates <- data.table(year = rep(1993:2050, 4))
setkey(all_dates,year)
all_dates[, QTR := 1:.N, by = year]
all_dates <- as.data.frame(all_dates)
x <- paste0(all_dates$year, all_dates$QTR) >= paste0(start_year, start_QTR) & paste0(all_dates$year, all_dates$QTR) <= paste0(end_year, end_QTR)
return(all_dates[x,])
}
### list of all identities codes
code_list <- c("bd", "bk", "ic", "iv", "ia", "ep", "hc", "sa", "cp", "co", "pn", "in", "fi", "oo")
### I general regex to capture all variation of identity codes
gen_codes_regex <- function()
{
codes <- NULL
codes[[1]] <- c("broker\\s+dealer", "bd")
codes[[2]] <- c("bank", "bk")
codes[[3]] <- c("insurance\\s+company", "ic")
codes[[4]] <- c("investment\\s+company", "iv")
codes[[5]] <- c("investment\\s+advisor", "ia")
codes[[6]] <- c("employee\\s+benefit", "ep")
codes[[7]] <- c("holding\\s+company", "hc")
codes[[8]] <- c("savings\\s+association", "sa")
codes[[9]] <- c("church\\s+plan", "cp")
codes[[10]] <- c("corporation", "co\\s", "c0") ### company sometimes is catched as corporation
codes[[11]] <- c("partnership", "pn")
codes[[12]] <- c("individual", "in", "in")
codes[[13]] <- c("non-U.S.\\s+institution", "fi")
codes[[14]] <- c("other", "oo", "o0", "00", "0.0", "o.o", "o.0", "0.o")
for(i in 1:14)
{
codes[[i]] <- c(codes[[i]],
paste0(code_list[i], 2), # 2 is for second part of the form, sometimes it mergers with the previous answer
paste0(code_list[i], "page"), # the same with the word page
paste0("person",code_list[i])) # sometimes there are no space before the word person
}
regex_codes <- lapply(codes, function(x)paste0("(\\b", x, "\\b)", collapse = "|"))
return(regex_codes)
}
regex_codes <- gen_codes_regex()
### this function looks for a line with phare "type of reporting person"
### and returns type codes from the next lines
get_phares <- function(text)
{
require(stringr)
text <- tolower(text)
text <- unlist(strsplit(text, "\n"))
text <- text[which(str_detect(text,"[:graph:]"))]
ind <- grep("type\\s+(of|in|or)\\s+reporting\\s+person", text, perl = T)
if(length(ind) == 0) return(NA)
item12 <- NA
### first I check the next line, than line + 1 etc. up to line + 5
for(i in 0:5)
{
#print(i)
lines <- text[ind + i]
vector <- sapply(regex_codes, function(x) max(grepl(x, lines, perl = T)))
if(sum(vector) == 0) next
#print(vector)
item12 <- paste(code_list[which((vector) == 1)],collapse = "|")
return(item12)
}
return(item12)
}
### sometimes phares "type of reporting person" is splitted across several lines
### I merge everything into one line and search for a phrase here
### (this function is significantly slower than the previous one)
get_phares_one_line <- function(text)
{
require(stringr)
text <- tolower(text)
text <- unlist(strsplit(text, "\n"))
text <- text[which(str_detect(text,"[:graph:]"))]
### some forms could be too long,
### so I look only at the first 3000 lines
text <- text[1:3000]
text <- paste(text, collapse = " ")
text <- trimws(text)
text <- gsub("\\s+", " ", text, perl = T)
#ind <- grep("type\\s+(of|in|or)\\s+reporting\\s+person", text, perl = T)
loc <- as.data.frame(str_locate_all(text, "type\\s(of|in|or)\\sreporting\\sperson"))
if(length(loc$start) == 0) loc <- as.data.frame(str_locate_all(text, "type\\s(of|in|or)\\sperson\\sreporting"))
if(length(loc$start) == 0) return(NA)
item12 <- NA
for(str_len in c(20, 40, 60, 80, 100, 120, 160))
{
#print(i)
lines <- NULL
for(start in loc$start)
{
lines <- c(lines, substr(text, start, start + str_len))
}
vector <- sapply(regex_codes, function(x) max(grepl(x, lines, perl = T)))
if(sum(vector) == 0) next
#print(vector)
item12 <- paste(code_list[which((vector) == 1)],collapse = "|")
return(item12)
}
return(item12)
}
dates <- get_dates(start_year, start_QTR, end_year, end_QTR)
dates$year_QTR <- paste0(dates$year, dates$QTR)
for(yearqtr in dates$year_QTR)
{
print(Sys.time())
print(yearqtr)
### read master file
sec_name <- paste0(dir_out, "Parsed_forms_", yearqtr, ".rds")
sec_header <- readRDS(sec_name)
sec_header <- as.data.table(sec_header)
### read filings
dbname <- paste0(dir_in, "sc13_", yearqtr, ".sqlite")
con <- dbConnect(drv=RSQLite::SQLite(), dbname=dbname)
## Fetch data into data frame
res <- dbSendQuery(con, "SELECT FILENAME, FILING FROM filings")
res1 <- dbFetch(res,n=-1)
### check the order just in case
match <- match(res1$FILENAME, sec_header$FILENAME)
sec_header <- sec_header[match]
### call identity item12
sec_header$item12 <- NA
### fidelity has its own formatting
sec_header$item12[sec_header$fil_CIK == "0000315066"] <- "hc|in"
### the match a majority with the first faster function
sec_header$item12[is.na(sec_header$item12)] <- sapply(res1$FILING[is.na(sec_header$item12)], get_phares)
### the rest is match with more diligent but slower function
sec_header$item12[is.na(sec_header$item12)] <- sapply(res1$FILING[is.na(sec_header$item12)], get_phares_one_line)
print(mean(is.na(sec_header$item12)))
saveRDS(sec_header, sec_name)
}