-
Notifications
You must be signed in to change notification settings - Fork 0
/
sec_utils.py
79 lines (67 loc) · 2.97 KB
/
sec_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import re
def get_tree_data(element, depth=0, list_depth=0, prev_depth=-1, lists=None, final_dict=None):
    """Collect paragraph text from an element tree, keyed by its heading path.

    The tree is walked depth-first. Every ``HEAD`` element contributes its
    stripped text to the current heading path, and every ``P`` element has its
    full text (including the text of nested elements) appended to the entry
    stored under ``str(current_heading_path)``.

    Args:
        element: Node exposing ``tag``, ``text``, ``itertext()`` and iteration
            over its children (for example ``xml.etree.ElementTree.Element``).
        depth: Nesting depth of ``element``; 0 for the node the walk starts at.
        list_depth: Length of the current heading path plus one, as computed
            by the parent call just before descending into ``element``.
        prev_depth: Depth of the parent node. Not used by the function; kept
            so existing positional callers keep working.
        lists: Accumulated heading paths; the last entry is the current one.
            A fresh ``[[]]`` is created when omitted.
        final_dict: Accumulator for the result; a fresh dict is created when
            omitted.

    Returns:
        The ``final_dict`` mapping ``str(heading_path)`` to the concatenated
        text of all ``P`` elements found under that path. Headings that have
        no ``P`` element below them do not appear in the result.
    """
    if lists is None:
        lists = [[]]
    if final_dict is None:
        final_dict = {}

    if element.tag == "HEAD":
        # ``text`` is None when the HEAD has no direct text; use an empty
        # heading instead of failing on ``None.strip()``.
        heading = (element.text or "").strip()
        if depth <= list_depth:
            # Not deeper than the current path: start a new path that keeps
            # only the leading ``depth - 2`` headings of the current one.
            # For depth < 2 the slice bound is negative and trims from the end.
            lists.append(lists[-1][:depth - 2])
        lists[-1].append(heading)
    elif element.tag == "P":
        # "P" carries the text; it is stored under the current heading path.
        key = str(lists[-1])
        final_dict[key] = final_dict.get(key, "") + "".join(element.itertext())

    # Recurse into every child one level deeper. ``list_depth`` is re-read
    # for each child because an earlier sibling may have extended the path.
    for child in element:
        get_tree_data(child, depth + 1, len(lists[-1]) + 1, depth, lists, final_dict)
    return final_dict
def get_metadata(s, text, metadata=None):
    """Build a metadata dict from a stringified heading path and its text.

    Args:
        s: String whose first two and last two characters are discarded and
            whose remainder is split on ``', '``. Presumably the ``str()`` of
            a list of heading strings -- verify against the caller.
        text: Text that is scanned for referenced sections via
            ``get_linked_sections``.
        metadata: Optional dict to fill in. Its keys are the prefixes that
            headings are matched against. When omitted, a fresh dict with the
            keys ``title``, ``chapter``, ``part``, ``subpart``, ``section``,
            ``description`` and ``mentioned_sections`` (all ``None``) is
            created for this call, so results never leak between calls.

    Returns:
        The filled-in ``metadata`` dict (the same object that was passed in,
        when one was passed).
    """
    if metadata is None:
        metadata = {
            "title": None,
            "chapter": None,
            "part": None,
            "subpart": None,
            "section": None,
            "description": None,
            "mentioned_sections": None,
        }

    s_split = s[2:-2].split("', '")
    # Snapshot the keys once; they are the prefixes headings are matched on.
    keys = tuple(metadata)
    for x in s_split:
        xsplit = x.lower().split("—")
        # Only headings containing an em dash are classified by their prefix.
        if len(xsplit) > 1:
            prefix = xsplit[0]
            for key in keys:
                if prefix.startswith(key):
                    # Store the heading with its original casing.
                    metadata[key] = x
                    break

    # The last heading in the path describes the entry itself.
    metadata["description"] = s_split[-1]
    # NOTE(review): splitting on a single space keeps only the first
    # space-delimited token of the description -- confirm the separator.
    metadata["section"] = s_split[-1].split(" ")[0]
    metadata["mentioned_sections"] = get_linked_sections(text)
    return metadata
def get_rule_and_description(document):
    """Return the first and last space-separated tokens of a description.

    Args:
        document: Object whose ``metadata`` mapping has a ``'description'``
            string.

    Returns:
        A ``(rule, rule_description)`` tuple: the first and the last piece of
        the description after splitting it on ``" "``, each stripped of
        surrounding whitespace.
    """
    description = document.metadata['description']
    tokens = description.split(" ")
    leading, trailing = tokens[0], tokens[-1]
    return leading.strip(), trailing.strip()
import re
def get_linked_sections(doc):
    """Extract the section references (``§ NNN.N``) mentioned in a text.

    Recognizes single references such as ``§ 240.1`` and plural forms joined
    by "and" or "through" such as ``§§ 240.2 and 240.3``. For a joined form
    both named sections are returned individually; the sections lying between
    the ends of a "through" range are not expanded.

    Args:
        doc: Text to scan.

    Returns:
        A list of unique references, each starting with a single ``§``, in
        order of first appearance.
    """
    pattern = r'§§?\s\d{3}\.\d+(?:\s(?:and|through)\s\d{3}\.\d+)?'
    cleaned_sections = []
    for section in re.findall(pattern, doc):
        # Collapse the plural marker so every reference uses a single '§',
        # including a lone "§§ 240.1" taken from a comma-separated list.
        section = section.replace('§§', '§')
        # A reference without "and"/"through" splits into exactly one part.
        for part in re.split(r'\s(?:and|through)\s', section):
            part = part.strip()
            # The second half of a joined reference is a bare number; give it
            # the same '§' prefix the first half carries.
            if not part.startswith('§'):
                part = f"§ {part}"
            cleaned_sections.append(part)
    # dict.fromkeys removes duplicates while keeping first-occurrence order.
    return list(dict.fromkeys(cleaned_sections))