forked from volkovacodes/Block_Codes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
5. parsing_prc_position.R
142 lines (127 loc) · 4.5 KB
/
5. parsing_prc_position.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
### ---- Run configuration ----
### Directory holding the quarterly SQLite inputs; the driver loop below opens
### "<dir_in>sc13_<year><QTR>.sqlite" for every quarter in the range.
dir_in <- "/Volumes/KINGSTON/Blocks/Clean Forms/"
### Directory holding "Parsed_forms_<year><QTR>.rds"; each file is read,
### extended with the parsed percentages and written back in place.
dir_out <- "/Volumes/KINGSTON/Blocks/Parsed Forms/"
### First and last quarter to process (both passed to get_dates()).
start_year <- 1994
start_QTR <- 1
end_year <- 2018
end_QTR <- 4
require(RSQLite)
require(data.table)
### Generate the sequence of quarters between two year/quarter pairs.
###
### start_year, start_QTR: first quarter of the range (inclusive).
### end_year, end_QTR:     last quarter of the range (inclusive).
###
### Returns a data.frame with integer columns `year` and `QTR`, one row per
### quarter in chronological order. The result has zero rows when the start
### lies after the end.
get_dates <- function(start_year, start_QTR, end_year, end_QTR) {
  start_year <- as.integer(start_year)
  end_year <- as.integer(end_year)
  # Compare quarters through a numeric key (year * 10 + quarter) instead of
  # pasted strings, so the ordering does not depend on string collation.
  start_key <- start_year * 10L + as.integer(start_QTR)
  end_key <- end_year * 10L + as.integer(end_QTR)
  if (start_key > end_key) {
    return(data.frame(year = integer(0), QTR = integer(0)))
  }
  years <- seq.int(start_year, end_year)
  all_dates <- data.frame(
    year = rep(years, each = 4L),
    QTR = rep(1:4, times = length(years))
  )
  key <- all_dates$year * 10L + all_dates$QTR
  out <- all_dates[key >= start_key & key <= end_key, , drop = FALSE]
  rownames(out) <- NULL
  out
}
### Extract the text windows in which ownership percentages are searched for.
###
### x: one filing as a single string (lines separated by "\n"), or NA.
###
### Returns a character vector with one element per line that mentions
### "percent" (case-insensitive): that line plus up to 15 following non-blank
### lines, joined with " \n". Returns NULL when "percent" never occurs and
### returns `x` unchanged when it is empty or NA.
get.lines <- function(x) {
  if (length(x) == 0L || all(is.na(x))) return(x)
  con <- textConnection(x)
  # close the connection even if reading or matching fails
  on.exit(close(con), add = TRUE)
  body <- readLines(con)
  ### 98% of forms have length below 3000
  ### and I am looking for mentions of prc only in the first part
  body <- head(body, 3000L)
  # keep only lines with at least one visible character; base regex is used
  # so this function does not depend on stringr being attached
  body <- body[grepl("[[:graph:]]", body)]
  ### take each line containing "percent" together with the 15 lines after it
  ### and collapse them into one string
  ind <- grep("percent", body, ignore.case = TRUE)
  if (length(ind) == 0L) return(NULL)
  n_body <- length(body)
  vapply(
    ind,
    # clamp the window at the end of the text so no "NA" filler is pasted in
    function(i) paste(body[i:min(i + 15L, n_body)], collapse = " \n"),
    character(1)
  )
}
### Extract the reported positions (percent of class) of all investors.
###
### all_lines: character vector (or list) of text windows as returned by
###            get.lines(); each window holds lines separated by "\n".
###
### The first 3, 6, ..., 15 lines of every window are searched in turn; the
### search stops at the first depth where any window yields a match. Each
### window contributes at most one match (its leftmost one).
###
### Returns a single string with the matches joined by "|", or "" when
### nothing is found.
get.prc <- function(all_lines) {
  all_lines <- as.character(unlist(all_lines))
  all_lines <- all_lines[!is.na(all_lines)]

  # Patterns are tried in this order; the first that matches in any window wins.
  prc_patterns <- c(
    "(\\d{1,4}((\\,|\\.)\\d{0,7}|)( |)\\%|\\d{0,3}(\\.\\d{1,7}|)( |)\\%)",
    "-0-",
    "\\d{0,3}\\.\\d{1,7}",
    "0 %"
  )

  # Join the first n lines of one window into a single line.
  first_lines <- function(x, n) {
    paste(head(unlist(strsplit(x, "\n")), n), collapse = " ")
  }

  # Leftmost match of the highest-priority pattern, one per matching window.
  locate_prc <- function(lines) {
    for (pattern in prc_patterns) {
      hits <- regmatches(lines, regexpr(pattern, lines, perl = TRUE))
      if (length(hits) > 0L) return(hits)
    }
    character(0)
  }

  prc <- character(0)
  for (n_lines in seq(3L, 15L, by = 3L)) {
    lines <- vapply(all_lines, first_lines, character(1),
                    n = n_lines, USE.NAMES = FALSE)
    # lower-case BEFORE the textual substitutions below, otherwise
    # upper-case "NONE" / "N/A" are never converted to a zero position
    lines <- tolower(lines)
    ### collapse runs of whitespace and trim both ends
    lines <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", lines, perl = TRUE)
    # drop the literal "240.13" (fixed match, so the dot is not a wildcard)
    lines <- gsub("240.13", "", lines, fixed = TRUE)
    lines <- gsub("-0-%", "0%", lines, fixed = TRUE)
    ### textual ways of reporting no position are rewritten as "0 %";
    ### note "less" and "none" also match inside longer words
    lines <- gsub("none|n/a|less|-0-|lessthan5%", "0 %", lines)
    prc <- locate_prc(lines)
    if (length(prc) > 0L) break
  }
  paste(prc, collapse = "|")
}
### Extract the maximum position in the block.
### In almost all cases the maximum position is
### the aggregate position among subsidiaries.
###
### x: percentages joined by "|", as produced by get.prc()
###    (e.g. "5.2%|10 %").
###
### Returns the largest parsed value that does not exceed 100, or NA when no
### token parses to such a value (so callers can test it with is.na()).
get.max.prc <- function(x) {
  x <- gsub("%", "", x, fixed = TRUE)
  x <- unlist(strsplit(x, "|", fixed = TRUE))
  # tokens that are not plain numbers (e.g. "5,2") become NA and are dropped
  # below; the coercion warning is expected and therefore silenced
  x <- suppressWarnings(as.numeric(x))
  ### this 9 comes from row (9) in form in some filings
  fused_9 <- which(x %/% 100 == 9)
  x[fused_9] <- x[fused_9] - 900
  # same correction for a leading 11 (presumably row (11) of the form)
  fused_11 <- which(x %/% 100 == 11)
  x[fused_11] <- x[fused_11] - 1100
  fused_11_short <- which(x %/% 10 == 11)
  x[fused_11_short] <- x[fused_11_short] - 110
  x <- x[!is.na(x) & x <= 100]
  # max() of an empty vector is -Inf (with a warning); report NA instead
  if (length(x) == 0L) return(NA_real_)
  max(x)
}
### Driver: for every quarter in the configured range, read the filings from
### "<dir_in>sc13_<yearqtr>.sqlite", parse the reported percentages and store
### them in the matching "<dir_out>Parsed_forms_<yearqtr>.rds" (overwritten).
dates <- get_dates(start_year, start_QTR, end_year, end_QTR)
dates$year_QTR <- paste0(dates$year, dates$QTR)
for (yearqtr in dates$year_QTR) {
  print(Sys.time())
  print(yearqtr)
  dbname <- paste0(dir_in, "sc13_", yearqtr, ".sqlite")
  con <- dbConnect(drv = RSQLite::SQLite(), dbname = dbname)
  ## Fetch data into data frame; the result set and the connection are
  ## released even if the query fails, so they do not pile up across quarters
  res1 <- tryCatch({
    res <- dbSendQuery(con, "SELECT FILENAME, FILING FROM filings")
    tryCatch(dbFetch(res, n = -1), finally = dbClearResult(res))
  }, finally = dbDisconnect(con))

  sec_name <- paste0(dir_out, "Parsed_forms_", yearqtr, ".rds")
  sec_header <- readRDS(sec_name)
  # row of res1 that corresponds to each row of sec_header (NA if absent)
  idx <- match(sec_header$FILENAME, res1$FILENAME)

  lines <- lapply(res1$FILING, get.lines)
  prc <- vapply(lines, get.prc, character(1), USE.NAMES = FALSE)
  max.prc <- vapply(prc, get.max.prc, numeric(1), USE.NAMES = FALSE)
  ### usually if information is missing here it is an exit filing
  ### information is missing in only ~1.5% of all forms
  # treat both NA and -Inf (max() of an empty set) as missing
  max.prc[!is.finite(max.prc)] <- 0

  sec_header$max_prc <- max.prc[idx]
  sec_header$prc <- prc[idx] ### kept just in case
  saveRDS(sec_header, sec_name)
}