Skip to content

Commit

Permalink
Merge pull request #18 from sodascience/media_added
Browse files Browse the repository at this point in the history
Extend media, location and URL regex
  • Loading branch information
mellelieuwes authored Jul 11, 2022
2 parents b9fec5c + 41c48c8 commit 21de258
Showing 1 changed file with 8 additions and 4 deletions.
12 changes: 8 additions & 4 deletions data_extractor/whatsapp_chat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,14 @@
import zipfile


# First, narrower definitions of the three message patterns. Each name is
# assigned again directly below, so the later (broader) values are the ones
# that remain bound after this section executes.
URL_PATTERN = r'(https?://\S+)'
LOCATION_PATTERN = r'(Location: https?://\S+)'
ATTACH_FILE_PATTERN = r'(<attached: \S+>)'
# Case-insensitive URL matcher. Accepts a candidate that starts with
# "http://" / "https://", with "www" plus up to three digits and a dot, or
# with a bare "domain.tld/" prefix (2-4 letter TLD). The body may contain
# parenthesised groups nested up to two levels, and the final character must
# not be whitespace or trailing punctuation such as ".", ",", ";", quotes or
# brackets.
URL_PATTERN = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)" \
r"(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|" \
r"(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
# Location matcher. Matches "Location: <http(s) url>" or "Locatie: <http(s) url>"
# with the first letter in either case, or any text containing
# "live locatie gedeeld" / "live location shared" (leading "l" in either case).
# NOTE(review): the "locatie"/"gedeeld" alternatives look like the Dutch
# counterparts of the English phrases -- confirm against real exports.
LOCATION_PATTERN = r'((L|l)ocation: https?://\S+)|((l|L)ocatie: https?://\S+)|' \
r'(.*(l|L)ive locatie gedeeld.*)|(.*(l|L)ive location shared.*)'
# Attachment / omitted-media matcher. Matches "<attached: NAME>",
# "<Media weggelaten>" / "<Media omitted>", or "<kind> weggelaten" /
# "<kind> omitted" where <kind> is one of afbeelding, GIF, video, image,
# audio, sticker/Sticker, or any text containing "document".
ATTACH_FILE_PATTERN = r'(<attached: \S+>)|(<Media (weggelaten|omitted)>)|' \
r'((afbeelding|GIF|video|image|audio|(s|S)ticker|.*document.*) (weggelaten|omitted))'

# Matches names that end in a literal ".txt" extension. The dot is escaped so
# that it no longer stands for "any character" (previously a name such as
# "chat_txt" was accepted as well).
FILE_RE = re.compile(r".*\.txt$")
# Matches names that contain the "__MACOSX" marker anywhere. The previous
# pattern ended in "X*", which quantified only the letter "X" and therefore
# also accepted "__MACOS" on its own.
# NOTE(review): presumably used to skip macOS metadata entries inside the
# uploaded zip archive -- confirm against the caller.
HIDDEN_FILE_RE = re.compile(r".*__MACOSX.*")

Expand Down Expand Up @@ -220,7 +225,6 @@ def remove_alerts_from_line(r_x, line_df):
Cleaned message string
"""
if re.search(r_x, line_df):
print(line_df[:re.search(r_x, line_df).start()])
return line_df[:re.search(r_x, line_df).start()]
else:
return line_df
Expand Down

0 comments on commit 21de258

Please sign in to comment.