-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_cleaner.py
90 lines (70 loc) · 2.65 KB
/
data_cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import re
from tqdm.contrib.concurrent import process_map
class DataCleaner:
    """Tag script lines with one-letter type codes and pair authors with text.

    ``clean_movie_script`` classifies every line of ``data`` against the
    regexes in ``patterns`` and keeps only the lines that end up carrying a
    ``<tag>: `` prefix.  ``extract_author_text_pairs`` then reads such tagged
    text and groups ``D: `` lines under the preceding ``C: `` line.
    """

    def __init__(self, data: str):
        """Store the raw script text and build the tag -> regex table.

        Args:
            data: Full script text with newline-separated lines.
        """
        self.data = data
        # Tried in insertion order with ``re.match``; the first hit wins.
        # NOTE(review): every string accepted by 'C' is also accepted by the
        # first alternative of 'M', which is tried earlier, so 'C' is never
        # returned with this ordering.  Since extract_author_text_pairs keys
        # on 'C: ' lines, confirm whether 'C' was meant to take precedence.
        self.patterns = {
            'M': r'(^[A-Z\s\'&]+$)|SCENE ',
            'S': r'^(EXT\.|INT\.)',
            'N': r'^([A-Z ]+)(?=\[|$)',
            'C': r'^[A-Z]+\s*$',
            'T': r'^\[.*?\]$',
            'E': r'^\([a-z\s]+\)$',
            'D': r'^([a-zA-Z ]+)$',
        }

    def _identify_line_type(self, line: str) -> "str | None":
        """Return the first tag whose pattern matches *line*, else ``None``."""
        for tag, pattern in self.patterns.items():
            if re.match(pattern, line):
                return tag
        return None

    def clean_script_line(self, line: str) -> str:
        """Prefix *line* with ``'<tag>: '`` when a pattern recognises it.

        Args:
            line: A single script line (no newline splitting is done here).

        Returns:
            ``f'{tag}: {line}'`` for the first matching pattern, or *line*
            unchanged when no pattern matches.
        """
        line_type = self._identify_line_type(line)
        if line_type:
            return f'{line_type}: {line}'
        return line

    def clean_movie_script(self) -> str:
        """Tag every line of ``self.data`` and drop the lines left untagged.

        Lines are processed serially with the built-in ``map``.

        Returns:
            The kept lines joined with newlines; an empty string when nothing
            is kept.
        """
        lines = self.data.split('\n')
        # One tuple lets a single startswith() call test every tag prefix.
        tag_prefixes = tuple(f'{tag}:' for tag in self.patterns)
        # NOTE: an unmatched line that already begins with '<tag>:' passes
        # this filter too, so previously tagged text survives a second run.
        # Empty lines never start with a prefix and are dropped here.
        kept_lines = [
            cleaned
            for cleaned in map(self.clean_script_line, lines)
            if cleaned.startswith(tag_prefixes)
        ]
        return '\n'.join(kept_lines)

    @staticmethod
    def extract_author_and_text(paragraph: str) -> "str | None":
        """Return the author name from a ``'C: NAME'`` line.

        Despite the name, only the author is returned.

        Args:
            paragraph: Text expected to start with ``'C: '`` followed by
                uppercase letters and/or whitespace.

        Returns:
            The name with surrounding whitespace removed, or ``None`` when
            the text does not match or the captured name is blank.
        """
        match = re.match(r'C: ([A-Z\s]+)', paragraph)
        if not match:
            return None
        # ``\s`` in the character class also captures trailing blanks or a
        # stray '\r' from CRLF input; strip them so names compare cleanly.
        return match.group(1).strip() or None

    def extract_author_text_pairs(self, content: str) -> "list[tuple[str, str]]":
        """Group ``D: `` lines under the ``C: `` line that precedes them.

        Args:
            content: Tagged text, one tagged line per newline-separated line.

        Returns:
            ``(author, text)`` tuples in order of appearance, where *text* is
            the author's stripped ``D: `` lines joined by single spaces.  An
            author with no ``D: `` lines yields an empty text.  Any line that
            is neither ``C: `` nor ``D: `` ends the current author's block.
        """
        pairs = []
        author = None
        text_lines = []

        for line in content.split('\n'):
            if line.startswith('D: '):
                # Text only counts while an author is active.
                if author:
                    text_lines.append(line[3:].strip())
                continue

            # Every other line closes the block that is currently open.
            if author:
                pairs.append((author, ' '.join(text_lines)))
            text_lines = []
            if line.startswith('C: '):
                author = self.extract_author_and_text(line)
            else:
                author = None

        # Flush the block still open at the end of the input.
        if author:
            pairs.append((author, ' '.join(text_lines)))
        return pairs