-
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbuild-db.sh
executable file
·71 lines (62 loc) · 1.49 KB
/
build-db.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/bin/bash
# Builds sfms.db using data from index.db - which is created using
# s3-ocr index sfms-history index.db
rm -f sfms.db
# Create tables
sqlite-utils create-table sfms.db documents \
id text \
title text \
path text \
etag text \
--pk id
sqlite-utils create-table sfms.db pages \
id text \
document_id text \
page integer \
text text \
--pk id
sqlite-utils add-foreign-key sfms.db \
pages document_id documents id
# Populate documents
sqlite-utils sfms.db --attach index2 index.db "$(cat <<EOF
insert into documents select
substr(s3_ocr_etag, 2, 8) as id,
key as title,
key as path,
replace(s3_ocr_etag, '"', '') as etag
from
index2.ocr_jobs
where
key in (
select path from index2.pages
where (
folder like 'INTAKE/%'
or folder like 'PUBLIC/%'
)
and folder not like '%PROCESSED INTAKE DOCUMENTS/%'
)
EOF
)"
# Populate pages
sqlite-utils sfms.db --attach index2 index.db "$(cat <<EOF
insert into pages select distinct
substr(s3_ocr_etag, 2, 8) || '-' || page as id,
substr(s3_ocr_etag, 2, 8) as document_id,
page,
text
from index2.pages
join index2.ocr_jobs
on index2.pages.path = index2.ocr_jobs.key
where
(
folder like 'INTAKE/%'
or folder like 'PUBLIC/%'
)
and folder not like '%PROCESSED INTAKE DOCUMENTS/%'
EOF
)"
# Fix document titles
sqlite-utils convert sfms.db documents title \
'value.split("/")[-1].split(".pdf")[0].replace("_", " ")'
# Enable full-text search
sqlite-utils enable-fts sfms.db pages text