-
Notifications
You must be signed in to change notification settings - Fork 0
/
posgres.py
127 lines (113 loc) · 5.02 KB
/
posgres.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import mailbox
import psycopg2
import email
insert_counter = 0
spam_counter = 0
trash_counter = 0
def import_mbox_to_postgres(mbox_file_path, db_connection_string):
# Open the MBOX file
mbox = mailbox.mbox(mbox_file_path)
# Iterate over each email
for msg in mbox:
# Establish database connection
with psycopg2.connect(db_connection_string) as conn:
with conn.cursor() as cur:
labels = msg.get('X-Gmail-Labels', '').split(',')
# print(labels)
if "Spam" in labels:
global spam_counter
spam_counter += 1
print(f" s{spam_counter} ", end='')
continue
if "Trash" in labels:
global trash_counter
trash_counter += 1
print(f" t{trash_counter} ", end='')
continue
# Extract relevant information
subject = msg['Subject']
sender = msg['From']
to_recipients = msg.get_all('To', [])
cc_recipients = msg.get_all('Cc', [])
bcc_recipients = msg.get_all('Bcc', [])
msg_date = msg['Date']
if msg_date:
date = email.utils.parsedate_to_datetime(msg_date)
else:
print("\n date none. Sender: ", sender)
print(subject)
print(date)
body = str(part.get_payload(decode=True))
print(body[:128], "\n")
continue
body = ""
# Handle multipart messages
if msg.is_multipart():
for part in msg.walk():
if part.get_content_type() == "text/plain":
try:
body = part.get_payload(decode=True).decode()
except UnicodeDecodeError:
# print("\n\tUnicodeDecodeError: ", sender)
# print(subject[:32], end='\t')
body = str(part.get_payload(decode=True))
# print(body[:64])
else:
try:
body = msg.get_payload(decode=True).decode()
except UnicodeDecodeError:
# print("\n\tUnicodeDecodeError: ", sender)
# print(subject)
# print(date)
body = str(part.get_payload(decode=True))
# print(body[:32])
# Insert into database
try:
cur.execute(
'''
INSERT INTO emails (subject, sender, to_recipients, cc_recipients,
bcc_recipients, body, date, labels)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
''',
(subject, sender, to_recipients, cc_recipients, bcc_recipients, body, date, labels)
)
except ValueError:
print("\n\tValueError: from: ", sender)
print(labels)
print(date)
print(subject)
print(body[:256], "\n")
continue
except psycopg2.ProgrammingError:
print("\n\tpsycopg2.ProgrammingError: from: ", sender)
print("cc: ", cc_recipients, " type: ", type(cc_recipients), " len ", len(cc_recipients))
for cc in cc_recipients:
print("cc: ", cc, " type: ", type(cc))
print(labels)
print(subject)
print(body[:256], "\n")
continue
except psycopg2.errors.InvalidTimeZoneDisplacementValue:
print("\n\tpsycopg2.errors.InvalidTimeZoneDisplacementValue: from: ", sender)
print(labels)
print(date)
print(subject)
print(body[:256], "\n")
continue
except psycopg2.errors.InFailedSqlTransaction:
print("\n\tpsycopg2.errors.InvalidTimeZoneDisplacementValue: from: ", sender)
print(labels)
print(date)
print(subject)
print(body[:256], "\n")
continue
global insert_counter
insert_counter += 1
print(f" {insert_counter} ", end=' ')
# Get database connection string from the user (modify for your specific setup)
# db_connection_string = "postgresql://dcar:y@localhost:<port>/emaildb"
db_connection_string = "postgresql://dcar:y@localhost/emaildb"
# Get MBOX file path from user
mbox_file_path = "all-gmail.mbox"
# Import the emails
import_mbox_to_postgres(mbox_file_path, db_connection_string)